HOW TO CREATE DATABASE
-----------------------------
CREATE DATABASE IF NOT EXISTS deepak;
CREATE DATABASE IF NOT EXISTS deepak
COMMENT 'My database name is deepak'
LOCATION '/hive/deepak'
WITH DBPROPERTIES ('key1' = 'value1', 'key2' = 'value2');
DESCRIBE DATABASE deepak;
DESCRIBE DATABASE EXTENDED deepak;
HOW TO DROP DATABASE
-----------------------------
DROP DATABASE IF EXISTS deepak;
DROP DATABASE IF EXISTS deepak CASCADE;
DROP DATABASE IF EXISTS deepak RESTRICT;
HOW TO USE DATABASE
-----------------------------
USE deepak;
HOW TO ALTER DATABASE
-----------------------------
ALTER DATABASE deepak SET DBPROPERTIES ('key1' = 'value11', 'key3' = 'value3');
ALTER DATABASE deepak SET OWNER USER hadoop;
ALTER DATABASE deepak SET OWNER ROLE hadoop;
HOW TO CREATE TABLE
-----------------------------
CREATE TABLE IF NOT EXISTS student1 ( name string, id int , course string, year int
);
CREATE TABLE IF NOT EXISTS student1 ( name string, id int , course string, year int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
CREATE TABLE IF NOT EXISTS student2 ( name string, id int , course string, year int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
CREATE EXTERNAL TABLE IF NOT EXISTS student3 ( name string, id int , course string,
year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
CREATE TEMPORARY TABLE IF NOT EXISTS student4 ( name string, id int , course
string, year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
HOW TO KNOW INFORMATION ABOUT TABLE
-----------------------------
DESCRIBE tbl_name;
DESCRIBE EXTENDED tbl_name;
DESCRIBE FORMATTED tbl_name;
LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student1;
LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student2;
HOW TO LOAD DATA INTO TABLE
-----------------------------
LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student2;
LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' INTO TABLE
student2;
hadoop fs -mkdir -p /home/deepak/work/hive_inputs
hadoop fs -put /home/deepak/work/hive_inputs/student.txt
/home/deepak/work/hive_inputs/student.txt
LOAD DATA INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO TABLE
student2;
hadoop fs -put /home/deepak/work/hive_inputs/student.txt
/home/deepak/work/hive_inputs/student.txt
LOAD DATA INPATH '/home/deepak/work/hive_inputs/student.txt' INTO TABLE student2;
HOW TO USE EXTERNAL TABLE
-----------------------------
hadoop fs -mkdir /hive/externaldata
hadoop fs -put /home/deepak/work/hive_inputs/student.txt
/hive/externaldata/student.txt
CREATE EXTERNAL TABLE IF NOT EXISTS studentext1
( name string, id int , course string, year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
SELECT * FROM studentext1;
DESCRIBE FORMATTED studentext1;
CREATE EXTERNAL TABLE IF NOT EXISTS studentext2
( name string, id int , course string, year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/externaldata';
SELECT * FROM studentext2;
DESCRIBE FORMATTED studentext2;
CREATE TABLE IF NOT EXISTS student
(
name string comment 'student name',
id int comment 'student id',
course string comment 'student course',
year int comment 'student year'
)
COMMENT 'My table name is student'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/deepak/student'
TBLPROPERTIES ('key1'='value1', 'key2'='value2')
;
LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student;
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student;
-- NOTE(review): table `employee` is not created anywhere in this file —
-- presumably (id int, course string, year string); confirm schema before running.
-- Fixed: the original used Unicode curly quotes (‘1’), which HiveQL cannot parse.
INSERT INTO TABLE employee VALUES (1, 'mca', '1'), (2, 'mca', '1'), (3, 'mca', '2'), (4, 'mca', '2');
set ;
set hive.cli.print.current.db=true;
set hive.cli.print.header=true;
INSERT INTO TABLE student VALUES ('arun', 1, 'mca', 1), ('anil', 2, 'mca', 1),
('sudheer', 3, 'mca', 2), ('santosh', 4, 'mca', 2);
UPDATE student SET year = 3, name = 'san' WHERE id = 4 ;
HOW TO VERIFY SCHEMA IN METASTORE DATABASE
------------------------------------------
USE hive_mysql_db;
SELECT * FROM DBS;
SELECT * FROM TBLS;
SELECT * FROM DBS join TBLS on DBS.DB_ID = TBLS.DB_ID;
SELECT DBS.NAME, TBLS.TBL_NAME FROM DBS join TBLS on DBS.DB_ID = TBLS.DB_ID;
COMPLEX DATA TYPES
------------------------------------------
array_type : ARRAY < data_type >
map_type : MAP < primitive_type, data_type >
struct_type : STRUCT < col_name : data_type [COMMENT col_comment], ...>
union_type : UNIONTYPE < data_type, data_type, ... >
CREATE TABLE IF NOT EXISTS studentdata
(
name string comment 'student name',
subjects array<string> comment 'student subjects',
marks map<string,int> comment 'student marks',
address struct< loc : string comment 'student location', pincode : int comment
'student pincode', city : string comment 'student city'>,
details uniontype<double, int, array<string>, struct<a:int,b:string>, boolean>
comment 'student details'
)
COMMENT 'My table name is studentdata'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/deepak/studentdata'
TBLPROPERTIES ('key1'='value1', 'key2'='value2')
;
sample data:
--------------------
venkat#math,phy,chem#math:40,phy:23,chem:60#kp,500004,pune#0,75.6
kumar#math,phy,chem#math:50,phy:20,chem:40#mundwa,500005,pune#1,78
rahul#math,phy,chem#math:30,phy:22,chem:30#hinjewadi,500003,pune#2,hi:hello
anil#math,phy,chem#math:13,phy:30,chem:60#tingre nagar,500002,pune#3,1:rank
arun#math,phy,chem#math:10,phy:20,chem:50#aundh,500001,pune#4,true
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/studentdata.txt' OVERWRITE
INTO TABLE studentdata;
SELECT * FROM studentdata;
SELECT name, subjects[0], marks['math'], address.pincode, details FROM
studentdata;
INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/hive_inputs/studentdata'
SELECT * FROM studentdata;
INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/hive_inputs/studentdata_1'
SELECT name, subjects[0], marks['math'], address.pincode, details FROM
studentdata;
INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/hive_inputs/studentdata_2'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n'
SELECT * FROM studentdata;
HOW TO CREATE PARTITION TABLE
------------------------------------------
CREATE TABLE IF NOT EXISTS student_partition ( name string , id int )
PARTITIONED BY ( course string , year int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
;
HOW TO WRITE HIVE STATIC PARTITION QUERIES
--------------------------------------------------------------
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student_cse_1.txt' OVERWRITE
INTO TABLE student_partition PARTITION(course = 'cse', year = 1);
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student_cse_2.txt' OVERWRITE
INTO TABLE student_partition PARTITION(course = 'cse', year = 2);
HOW TO WRITE PARTITION TABLE QUERIES
--------------------------------------------------------------
SHOW PARTITIONS student_partition;
SELECT * FROM student_partition;
SELECT * FROM student_partition WHERE year = 1;
SELECT * FROM student_partition WHERE course = 'cse';
SELECT * FROM student_partition WHERE course = 'mech' and year = 1;
SELECT * FROM student_partition WHERE course = 'mech' or year = 1;
HOW TO WRITE HIVE DYNAMIC PARTITION QUERIES
--------------------------------------------------------------
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE student_partition PARTITION(course, year) select * from
student;
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student1.txt' INTO TABLE
student;
INSERT INTO TABLE student_partition PARTITION(course = 'it', year) SELECT name, id,
year FROM student WHERE course = 'it';
INSERT INTO TABLE student_partition PARTITION(course = 'ece', year = 3) SELECT
name, id FROM student WHERE course = 'it' AND year = 3;
INSERT OVERWRITE TABLE student_partition PARTITION(course = 'ece', year = 3) SELECT
name, id FROM student WHERE course = 'it' AND year = 3;
INSERT INTO TABLE student_partition PARTITION(course = 'mca', year = 1) VALUES
('arun', 1), ('anil', 2), ('sudheer', 3), ('santosh', 4);
BELOW HIVE QUERIES ARE NOT ALLOWED
--------------------------------------------------------------
Note:- Dynamic partition cannot be the parent of a static partition '3'
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student_partition PARTITION(course = 'cse', year);
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student_partition PARTITION(course, year = 1);
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student_partition PARTITION(course, year);
INSERT INTO TABLE student_partition PARTITION(course , year = 3) SELECT name, id,
course FROM student WHERE year = 3;
DIFFERENT WAYS TO EXECUTE HIVE QUERIES
--------------------------------------------------------------
hive -e 'USE deepak; SELECT * FROM student;'
hive -S -e 'USE deepak; SELECT * FROM student;'
hadoop@hadoop:~$ hive -f '/home/deepak/work/hive_inputs/partition.hql';
hive> source /home/deepak/work/hive_inputs/partition.hql;
hadoop@hadoop:~$ hadoop fs -ls /
hive> dfs -ls / ;
hadoop@hadoop:~$ ls -l /home
hive> ! ls -l /home;
--------------------------------------------------------------
HOW TO WRITE BUCKET QUERIES IN HIVE
--------------------------------------------------------------
CREATE EXTERNAL TABLE IF NOT EXISTS users ( name string, id int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/users.txt' OVERWRITE INTO
TABLE users;
SELECT * FROM users;
venkat 3
raj 2
appu 4
anil 8
anvith 9
sony 1
nani 6
rani 7
rajesh 10
lg 5
CREATE TABLE IF NOT EXISTS users1 ( name string, id int) CLUSTERED BY (id) INTO 4
BUCKETS;
CREATE TABLE IF NOT EXISTS users2 ( name string, id int) CLUSTERED BY (id) SORTED
BY (id DESC) INTO 5 BUCKETS;
set hive.enforce.bucketing=true;
INSERT OVERWRITE TABLE users1 SELECT * FROM users;
INSERT OVERWRITE TABLE users2 SELECT * FROM users;
or
FROM users
INSERT OVERWRITE TABLE users1 SELECT *
INSERT OVERWRITE TABLE users2 SELECT *;
--------------------------------------------------------------
HOW THE LOGIC OF BUCKET WORKS
--------------------------------------------------------------
x = hash_code(bucket_col) % no_of_buckets
bucket m out of n => if ( x + 1 == m) then print ( bucket_col )
bucket m out of n => if ( x == m - 1) then print ( bucket_col )
1 % 5 => 1
2 % 5 => 2
3 % 5 => 3
4 % 5 => 4
5 % 5 => 0
6 % 5 => 1
7 % 5 => 2
8 % 5 => 3
9 % 5 => 4
10 % 5 => 0
part_0 => 5,10 => bucket 1 out of 5
part_1 => 1,6 => bucket 2 out of 5
part_2 => 2,7 => bucket 3 out of 5
part_3 => 3,8 => bucket 4 out of 5
part_4 => 4,9 => bucket 5 out of 5
1 % 4 => 1
2 % 4 => 2
3 % 4 => 3
4 % 4 => 0
5 % 4 => 1
6 % 4 => 2
7 % 4 => 3
8 % 4 => 0
9 % 4 => 1
10 % 4 => 2
part_0 => 4,8 => bucket 1 out of 4
part_1 => 1,5,9 => bucket 2 out of 4
part_2 => 2,6,10 => bucket 3 out of 4
part_3 => 3,7 => bucket 4 out of 4
1 % 3 => 1
2 % 3 => 2
3 % 3 => 0
4 % 3 => 1
5 % 3 => 2
6 % 3 => 0
7 % 3 => 1
8 % 3 => 2
9 % 3 => 0
10 % 3 => 1
part_0 => 3,6,9 => bucket 1 out of 3
part_1 => 1,4,7,10 => bucket 2 out of 3
part_2 => 2,5,8 => bucket 3 out of 3
1 % 2 => 1
2 % 2 => 0
3 % 2 => 1
4 % 2 => 0
5 % 2 => 1
6 % 2 => 0
7 % 2 => 1
8 % 2 => 0
9 % 2 => 1
10 % 2 => 0
part_0 => 2,4,6,8,10 => bucket 1 out of 2
part_1 => 1,3,5,7,9 => bucket 2 out of 2
SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 4 on id );
anil 8
appu 4
SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 3 on id );
anvith 9
nani 6
venkat 3
SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 2 on id );
anil 8
appu 4
rajesh 10
nani 6
raj 2
SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 4 on rand() );
Buckets:
----------------------------------------
1. no. of buckets is fixed
2. we can't change the bucket number later
3. we will use only for static data
Partitions:
----------------------------------------
1. no.of partition columns are fixed
2. we can't change the partition columns later
3. we can use for static / dynamic data also
4. static data (static partition) / dynamic data (dynamic partition)
Hive => MapReduce
----------------------------------------
Partition => MultipleOutputs
Bucket => HashPartitioner
Note: a limitation of `HashPartitioner` is that the number of partitions is fixed;
this can be solved using `MultipleOutputs`.
--------------------------------------------------------------
HOW TO WRITE JOIN QUERIES IN HIVE
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS products ( name string, id int , price int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
CREATE TABLE IF NOT EXISTS sales ( name string, year int , percentage double )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/products.txt' OVERWRITE INTO
TABLE products;
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/sales.txt' OVERWRITE INTO
TABLE sales;
select * from products;
hadoop 1 1000
hive 2 500
pig 3 750
hbase 4 600
spark 5 1500
select * from sales;
hadoop 2010 10.0
hadoop 2011 40.5
hadoop 2012 60.0
hive 2012 40.5
hive 2013 20.0
pig 2014 30.0
pig 2010 10.0
hbase 2014 70.5
mongodb 2013 50.5
--------------------------------------------------------------
REDUCE-SIDE-JOIN QUERIES IN HIVE
--------------------------------------------------------------
HOW TO WRITE INNER JOIN QUERY
--------------------------------------------------------------
SELECT products.* , sales.* from products JOIN sales ON products.name = sales.name;
hadoop 1 1000 hadoop 2010 10.0
hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2012 60.0
hive 2 500 hive 2012 40.5
hive 2 500 hive 2013 20.0
pig 3 750 pig 2014 30.0
pig 3 750 pig 2010 10.0
hbase 4 600 hbase 2014 70.5
HOW TO WRITE LEFT OUTER JOIN QUERY
--------------------------------------------------------------
SELECT products.* , sales.* from products LEFT OUTER JOIN sales ON products.name =
sales.name;
hadoop 1 1000 hadoop 2010 10.0
hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2012 60.0
hive 2 500 hive 2012 40.5
hive 2 500 hive 2013 20.0
pig 3 750 pig 2014 30.0
pig 3 750 pig 2010 10.0
hbase 4 600 hbase 2014 70.5
spark 5 1500 NULL NULL NULL
HOW TO WRITE RIGHT OUTER JOIN QUERY
--------------------------------------------------------------
SELECT products.* , sales.* from products RIGHT OUTER JOIN sales ON products.name =
sales.name;
hadoop 1 1000 hadoop 2010 10.0
hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2012 60.0
hive 2 500 hive 2012 40.5
hive 2 500 hive 2013 20.0
pig 3 750 pig 2014 30.0
pig 3 750 pig 2010 10.0
hbase 4 600 hbase 2014 70.5
NULL NULL NULL mongodb 2013 50.5
HOW TO WRITE FULL OUTER JOIN QUERY
--------------------------------------------------------------
SELECT products.* , sales.* from products FULL OUTER JOIN sales ON products.name =
sales.name;
hadoop 1 1000 hadoop 2012 60.0
hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2010 10.0
hbase 4 600 hbase 2014 70.5
hive 2 500 hive 2013 20.0
hive 2 500 hive 2012 40.5
NULL NULL NULL mongodb 2013 50.5
pig 3 750 pig 2010 10.0
pig 3 750 pig 2014 30.0
spark 5 1500 NULL NULL NULL
--------------------------------------------------------------
MAP-SIDE-JOIN QUERIES IN HIVE
--------------------------------------------------------------
WHICH MAP-SIDE-JOIN QUERIES ARE ALLOWED ?
--------------------------------------------------------------
SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products JOIN sales ON
products.name = sales.name;
SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products JOIN sales ON
products.name = sales.name;
SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products LEFT OUTER JOIN
sales ON products.name = sales.name;
SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products RIGHT OUTER JOIN
sales ON products.name = sales.name;
WHICH MAP-SIDE-JOIN QUERIES ARE NOT ALLOWED ?
--------------------------------------------------------------
SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products LEFT OUTER JOIN
sales ON products.name = sales.name;
SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products RIGHT OUTER JOIN
sales ON products.name = sales.name;
SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products FULL OUTER JOIN
sales ON products.name = sales.name;
SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products FULL OUTER JOIN
sales ON products.name = sales.name;
Note:
------------
inner join => mapjoin(left table) & mapjoin(right table) are allowed
left outer join => mapjoin(left table) not allowed & mapjoin(right table) allowed
right outer join => mapjoin(left table) allowed & mapjoin(right table) not allowed
full outer join => mapjoin(left table) & mapjoin(right table) are not allowed
--------------------------------------------------------------
SEMI-JOIN QUERIES IN HIVE
--------------------------------------------------------------
SELECT products.* from products LEFT SEMI JOIN sales ON products.name = sales.name;
hadoop 1 1000
hive 2 500
pig 3 750
hbase 4 600
--------------------------------------------------------------
HOW TO WRITE CUSTOM UDF QUERIES IN HIVE
--------------------------------------------------------------
SHOW FUNCTIONS;
DESCRIBE FUNCTION split;
DESCRIBE FUNCTION EXTENDED split;
--------------------------------------------------------------
WORDCOUNT SOLUTION IN HIVE USING HIVE FUNCTIONS
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS docs(line string);
LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/demoinput' OVERWRITE INTO
TABLE docs;
SELECT * FROM docs;
CREATE TABLE word_counts AS
SELECT word, count(1) as cnt FROM (SELECT explode(split(line,' ')) as word FROM
docs) w
GROUP BY word ORDER BY word;
SELECT * FROM word_counts;
--------------------------------------------------------------
ADD JAR /home/deepak/work/hive_inputs/hive-examples.jar;
LIST JARS;
DELETE JAR /home/deepak/work/hive_inputs/hive-examples.jar;
--------------------------------------------------------------
CREATE TEMPORARY FUNCTION md5sum AS
'com.deepak.deepak.hadoop.training.hive.udf.MD5SumUDF';
SELECT name, md5sum(name) FROM student;
DROP TEMPORARY FUNCTION IF EXISTS md5sum;
--------------------------------------------------------------
hadoop fs -put /home/deepak/work/hive_inputs/hive-examples.jar /hive/hive-
examples.jar
CREATE FUNCTION deepak.double_my_data AS
'com.deepak.deepak.hadoop.training.hive.udf.DoubleMyDataUDF' USING JAR
'hdfs:///hive/hive-examples.jar';
SELECT name, deepak.double_my_data(name, ":") FROM student;
DROP FUNCTION IF EXISTS deepak.double_my_data;
--------------------------------------------------------------
HOW TO WRITE SERDE QUERIES IN HIVE
--------------------------------------------------------------
What is a SerDe?
=> SerDe is a short name for "Serializer and Deserializer."
=> Hive uses SerDe (and FileFormat) to read and write table rows.
=> HDFS files --> InputFileFormat --> <key, value> --> Deserializer --> Row
object
=> Row object --> Serializer --> <key, value> --> OutputFileFormat --> HDFS
files
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_regex_serde_new
( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES ("input.regex" = "([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)")
;
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt'
OVERWRITE INTO TABLE student_regex_serde_new;
SELECT * FROM student_regex_serde_new WHERE year = 1;
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_regex_serde_old
( name string, id string , course string, year string )
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES ("input.regex" = "([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)")
;
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt'
OVERWRITE INTO TABLE student_regex_serde_old;
SELECT * FROM student_regex_serde_old WHERE year = 1;
--------------------------------------------------------------
HOW TO WRITE CUSTOM SERDE QUERIES IN HIVE
--------------------------------------------------------------
ADD JAR /home/deepak/work/hive_inputs/hive-examples.jar;
DROP TABLE student_multiply_serde;
CREATE TABLE IF NOT EXISTS student_multiply_serde
( name string, id int , course string, year int )
ROW FORMAT SERDE 'com.deepak.deepak.hadoop.training.hive.SerDe.ColumnMultiplySerDe'
WITH SERDEPROPERTIES ("number.multiply" = "3", "text.multiply" = "2")
;
DESCRIBE FORMATTED student_multiply_serde;
INSERT OVERWRITE TABLE student_multiply_serde SELECT * FROM student WHERE year = 1;
SELECT * FROM student WHERE year = 1;
SELECT * FROM student_multiply_serde WHERE year = 1;
--------------------------------------------------------------
DROP TABLE student_map_serde;
CREATE TABLE IF NOT EXISTS student_map_serde
( name string, id string , course string, year string )
ROW FORMAT SERDE 'com.deepak.deepak.hadoop.training.hive.SerDe.ColumnarMapSerDe'
;
INSERT OVERWRITE TABLE student_map_serde SELECT * FROM student WHERE year = 1;
SELECT * FROM student_map_serde WHERE year = 1;
--------------------------------------------------------------
Regex Serde for apache log
--------------------------------------------------------------
CREATE TABLE apache_log (
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
referer STRING,
agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|
[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/apache_clf.txt' OVERWRITE INTO
TABLE apache_log;
--------------------------------------------------------------
DIFFERENT WAYS TO STORE DATA USING HIVE QUERIES
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_text
( name string, id int , course string, year int ) STORED AS TEXTFILE;
or
CREATE TABLE IF NOT EXISTS student_text
( name string, id int , course string, year int )
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat';
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_seq
( name string, id int , course string, year int ) STORED AS SEQUENCEFILE;
or
CREATE TABLE IF NOT EXISTS student_seq
( name string, id int , course string, year int )
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat';
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_rc
( name string, id int , course string, year int ) STORED AS RCFILE;
or
CREATE TABLE IF NOT EXISTS student_rc
( name string, id int , course string, year int )
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.RCFileInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.RCFileOutputFormat';
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_orc
( name string, id int , course string, year int ) STORED AS ORC;
or
CREATE TABLE IF NOT EXISTS student_orc
( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_avro
( name string, id int , course string, year int ) STORED AS AVRO;
or
CREATE TABLE IF NOT EXISTS student_avro
( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat';
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_parquet
( name string, id int , course string, year int ) STORED AS PARQUET;
or
CREATE TABLE IF NOT EXISTS student_parquet
( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat';
--------------------------------------------------------------
FROM STUDENT
INSERT OVERWRITE TABLE student_text SELECT *
INSERT OVERWRITE TABLE student_seq SELECT *
INSERT OVERWRITE TABLE student_rc SELECT *
INSERT OVERWRITE TABLE student_orc SELECT *
INSERT OVERWRITE TABLE student_avro SELECT *
INSERT OVERWRITE TABLE student_parquet SELECT *
;
--------------------------------------------------------------
HOW TO WRITE HIVE QUERIES USING TRANSFORM
--------------------------------------------------------------
SELECT TRANSFORM(name, id, year) USING '/bin/cat' AS (name, id, year) FROM student;
-- grep passes through only the rows whose line contains 'cse'
-- (fixed: the original trailed an invalid `.//` comment; HiveQL uses `--`)
SELECT TRANSFORM(name, id, course) USING '/bin/grep cse' AS (name, id, course) FROM student;
SELECT TRANSFORM(name, id, year, course) USING 'python
/home/deepak/work/hive_inputs/student.py' AS (name, id, year, course) FROM student;
--------------------------------------------------------------
HOW TO WRITE ALTER QUERIES IN HIVE
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS test ( a string, b int , c int );
CREATE TABLE IF NOT EXISTS test1 LIKE test;
ALTER TABLE test1 RENAME TO mytest;
hive> describe mytest;
a string
b int
c int
ALTER TABLE mytest REPLACE COLUMNS ( x string, y double, a string, b int , c
float);
hive> describe mytest;
x string
y double
a string
b int
c float
ALTER TABLE mytest REPLACE COLUMNS (x string, y double);
hive> describe mytest;
x string
y double
ALTER TABLE mytest ADD COLUMNS (c1 INT COMMENT 'c1 column', c2 STRING COMMENT 'c2
column');
hive> describe mytest;
x string
y double
c1 int c1 column
c2 string c2 column
ALTER TABLE mytest CHANGE c1 c1 STRING COMMENT 'new c1 column';
hive> describe mytest;
x string
y double
c1 string new c1 column
c2 string c2 column
ALTER TABLE mytest CHANGE c1 c11 STRING COMMENT 'new c11 column';
hive> describe mytest;
x string
y double
c11 string new c11 column
c2 string c2 column
ALTER TABLE mytest CHANGE c2 c21 STRING COMMENT 'new c21 column' after x;
hive> describe mytest;
x string
c21 string new c21 column
y double
c11 string new c11 column
ALTER TABLE mytest CHANGE c21 c2 STRING COMMENT 'new c2 column' FIRST;
hive> describe mytest;
c2 string new c2 column
x string
y double
c11 string new c11 column
ALTER TABLE mytest SET TBLPROPERTIES ('comment' = 'add comment');
ALTER TABLE mytest SET TBLPROPERTIES ('comment' = 'new comment');
ALTER TABLE mytest SET TBLPROPERTIES ('key1' = 'value1', 'key2' = 'value2');
ALTER TABLE mytest SET SERDEPROPERTIES ('field.delim' = ',');
ALTER TABLE mytest SET SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' WITH
SERDEPROPERTIES ('field.delim' = ':');
ALTER TABLE mytest ADD PARTITION (partCol = 'value1') location 'loc1';
ALTER TABLE mytest ADD PARTITION (partCol = 'value2') location 'loc2';
or
ALTER TABLE mytest ADD
PARTITION (partCol = 'value1') location 'loc1'
PARTITION (partCol = 'value2') location 'loc2';
--------------------------------------------------------------
DIFFERENT WAYS TO RUN HIVE QUERIES
--------------------------------------------------------------
Running Hive CLI
--------------------------------------------------------------
$HIVE_HOME/bin/hive
-- Beeline vs Hive CLI: Beeline connects to HiveServer2 over JDBC and is the
-- recommended client on a cluster
Running HiveServer2 and Beeline
--------------------------------------------------------------
$HIVE_HOME/bin/hiveserver2
$HIVE_HOME/bin/beeline -u jdbc:hive2://localhost:10000/default -n deepak -d
org.apache.hive.jdbc.HiveDriver
Go for this more info:
https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients
Running HCatalog
--------------------------------------------------------------
$HIVE_HOME/hcatalog/sbin/hcat_server.sh
$HIVE_HOME/hcatalog/bin/hcat
Running WebHCat
--------------------------------------------------------------
$HIVE_HOME/hcatalog/sbin/webhcat_server.sh start
$HIVE_HOME/hcatalog/sbin/webhcat_server.sh stop
http://localhost:50111/templeton/v1
http://localhost:50111/templeton/v1/ddl/database/default/table/student?
user.name=hadoop
Role Management Commands (authorization)
--------------------------------------------------------------
CREATE ROLE deepak;
DROP ROLE deepak;
SHOW ROLES;
SHOW CURRENT ROLES;
--------------------------------------------------------------
ACID functionality in HIVE
--------------------------------------------------------------
Add the below properties in "hive-site.xml" file & restart hive server
<property>
<name>hive.in.test</name>
<value>false</value>
</property>
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
<property>
<name>hive.enforce.bucketing</name>
<value>true</value>
</property>
<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
</property>
<property>
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<property>
<name>hive.compactor.worker.threads</name>
<value>2</value>
</property>
CREATE HIVE TABLE WITH CLUSTERED BY, ORC, TBLPROPERTIES
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_acid
( name string, id int, course string, year int )
CLUSTERED BY (name) INTO 4 BUCKETS
STORED AS ORC
LOCATION '/hive/deepak/student_acid'
TBLPROPERTIES ('transactional' = 'true');
INSERT INTO TABLE student_acid VALUES
('arun', 1, 'mca', 1),
('anil', 2, 'mca', 1),
('sudheer', 3, 'mca', 2),
('santosh', 4, 'mca', 2)
;
UPDATE student_acid
SET year = 3, course = 'mech'
WHERE id = 4 ;
DELETE FROM student_acid WHERE name = 'anil';
==================================================================
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student1.txt' INTO TABLE
student_partition PARTITION(course, year);
CREATE TABLE IF NOT EXISTS student_partition_test ( name string , id int )
PARTITIONED BY ( course string , year int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/deepak/student_partition_test'
;
ALTER TABLE student_partition_test ADD PARTITION (course = 'cse' , year = 1)
location '/hive/deepak/student_partition_test/course=cse/year=1';
==================================================================
use deepak;
add jar /home/deepak/work/deepak-pdf-utils.jar;
drop table student_pdf;
CREATE TABLE IF NOT EXISTS student_pdf
( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ("field.delim" = " ")
STORED AS
INPUTFORMAT 'com.deepak.deepak.hadoop.training.pdf.mapred.deepakPdfInputFormat'
OUTPUTFORMAT
'com.deepak.deepak.hadoop.training.pdf.mapred.deepakHivePdfOutputFormat'
;
LOAD DATA LOCAL INPATH '/home/deepak/work/input/student.pdf' OVERWRITE INTO TABLE
student_pdf;
select * from student_pdf;
INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/output/hive-op'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS
INPUTFORMAT 'com.deepak.deepak.hadoop.training.pdf.mapred.deepakPdfInputFormat'
OUTPUTFORMAT
'com.deepak.deepak.hadoop.training.pdf.mapred.deepakHivePdfOutputFormat'
SELECT * FROM student_pdf;
INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/output/hive-op-1'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':'
SELECT * FROM student_pdf;
==================================================================
hadoop jar /home/deepak/work/deepak-pdf-utils.jar
com.deepak.deepak.hadoop.training.pdf.utils.deepakPdfWordCountJob
file:///home/deepak/work/input/student.pdf file:///home/deepak/work/output/mr-op
hadoop jar /home/deepak/work/deepak-pdf-utils.jar
com.deepak.deepak.hadoop.training.pdf.utils.deepakPdfWordCountJob
-Dmapred.pdfoutputformat.separator=*** file:///home/deepak/work/input/student.pdf
file:///home/deepak/work/output/mr-op-1
==================================================================