0% found this document useful (0 votes)
117 views22 pages

Hive Commands

The document provides instructions on how to perform various operations with databases and tables in Hive including: 1) Creating, altering, dropping, and describing databases and tables. It also covers creating partitioned, bucketed, external, and temporary tables. 2) Loading data into tables from files stored locally or in HDFS. 3) Querying data from tables including filtering on partition keys and joining tables. 4) Complex data types like arrays, maps, structs, and unions that can be used in tables. 5) Different ways to execute Hive queries like using the hive shell command or SQL statements.

Uploaded by

anuja shinde
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
117 views22 pages

Hive Commands

The document provides instructions on how to perform various operations with databases and tables in Hive including: 1) Creating, altering, dropping, and describing databases and tables. It also covers creating partitioned, bucketed, external, and temporary tables. 2) Loading data into tables from files stored locally or in HDFS. 3) Querying data from tables including filtering on partition keys and joining tables. 4) Complex data types like arrays, maps, structs, and unions that can be used in tables. 5) Different ways to execute Hive queries like using the hive shell command or SQL statements.

Uploaded by

anuja shinde
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 22

HOW TO CREATE DATABASE

-----------------------------
-- Simplest form: create the database only if it does not already exist.
CREATE DATABASE IF NOT EXISTS deepak;

-- Full form: attach a human-readable description, an explicit HDFS location
-- for the database directory, and arbitrary key/value metadata properties.
CREATE DATABASE IF NOT EXISTS deepak


COMMENT 'My database name is deepak'
LOCATION '/hive/deepak'
WITH DBPROPERTIES ('key1' = 'value1', 'key2' = 'value2');

DESCRIBE DATABASE deepak;

DESCRIBE DATABASE EXTENDED deepak;

HOW TO DROP DATABASE


-----------------------------
-- Default behaviour (equivalent to RESTRICT): fails if the database still
-- contains tables.
DROP DATABASE IF EXISTS deepak;

-- CASCADE: drop the database AND every table inside it.
DROP DATABASE IF EXISTS deepak CASCADE;

-- RESTRICT (explicit form of the default): refuse to drop a non-empty database.
DROP DATABASE IF EXISTS deepak RESTRICT;

HOW TO USE DATABASE


-----------------------------
USE deepak;

HOW TO ALTER DATABASE


-----------------------------
ALTER DATABASE deepak SET DBPROPERTIES ('key1' = 'value11', 'key3' = 'value3');

ALTER DATABASE deepak SET OWNER USER hadoop;

ALTER DATABASE deepak SET OWNER ROLE hadoop;

HOW TO CREATE TABLE


-----------------------------
CREATE TABLE IF NOT EXISTS student1 ( name string, id int , course string, year int
);

CREATE TABLE IF NOT EXISTS student1 ( name string, id int , course string, year int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

CREATE TABLE IF NOT EXISTS student2 ( name string, id int , course string, year int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
CREATE EXTERNAL TABLE IF NOT EXISTS student3 ( name string, id int , course string,
year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

CREATE TEMPORARY TABLE IF NOT EXISTS student4 ( name string, id int , course
string, year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

HOW TO KNOW INFORMATION ABOUT TABLE


-----------------------------
DESCRIBE tbl_name;

DESCRIBE EXTENDED tbl_name;

DESCRIBE FORMATTED tbl_name;

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student1;

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student2;

HOW TO LOAD DATA INTO TABLE


-----------------------------
LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO
TABLE student2;

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' INTO TABLE


student2;

hadoop fs -mkdir -p /home/deepak/work/hive_inputs

hadoop fs -put /home/deepak/work/hive_inputs/student.txt


/home/deepak/work/hive_inputs/student.txt

LOAD DATA INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO TABLE


student2;

hadoop fs -put /home/deepak/work/hive_inputs/student.txt


/home/deepak/work/hive_inputs/student.txt

LOAD DATA INPATH '/home/deepak/work/hive_inputs/student.txt' INTO TABLE student2;


HOW TO USE EXTERNAL TABLE
-----------------------------
hadoop fs -mkdir /hive/externaldata

hadoop fs -put /home/deepak/work/hive_inputs/student.txt


/hive/externaldata/student.txt

CREATE EXTERNAL TABLE IF NOT EXISTS studentext1


( name string, id int , course string, year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

SELECT * FROM studentext1;

DESCRIBE FORMATTED studentext1;

CREATE EXTERNAL TABLE IF NOT EXISTS studentext2


( name string, id int , course string, year int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/externaldata';

SELECT * FROM studentext2;

DESCRIBE FORMATTED studentext2;

CREATE TABLE IF NOT EXISTS student


(
name string comment 'student name',
id int comment 'student id',
course string comment 'student course',
year int comment 'student year'
)
COMMENT 'My table name is student'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/deepak/student'
TBLPROPERTIES ('key1'='value1', 'key2'='value2')
;

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student;

-- Insert literal rows into the employee table.
-- Fixed: the original used typographic quotes (‘1’) around the last column's
-- values; HiveQL only accepts straight single/double quotes as string
-- delimiters, so the statement would fail to parse.
INSERT INTO TABLE employee VALUES (1, 'mca', '1'), (2, 'mca', '1'), (3, 'mca',
'2'), (4, 'mca', '2');
set ;

set hive.cli.print.current.db=true;

set hive.cli.print.header=true;

INSERT INTO TABLE student VALUES ('arun', 1, 'mca', 1), ('anil', 2, 'mca', 1),
('sudheer', 3, 'mca', 2), ('santosh', 4, 'mca', 2);

-- NOTE(review): UPDATE only works on ACID (transactional) tables. The plain
-- 'student' table created above is not declared transactional, so this
-- statement will fail unless Hive is configured for ACID — compare with the
-- student_acid example later in this file.
UPDATE student SET year = 3, name = 'san' WHERE id = 4 ;

HOW TO VERIFY SCHEMA IN METASTORE DATABASE


------------------------------------------
USE hive_mysql_db;
SELECT * FROM DBS;
SELECT * FROM TBLS;

SELECT * FROM DBS join TBLS on DBS.DB_ID = TBLS.DB_ID;


SELECT DBS.NAME, TBLS.TBL_NAME FROM DBS join TBLS on DBS.DB_ID = TBLS.DB_ID;

COMPLEX DATA TYPES


------------------------------------------
array_type : ARRAY < data_type >
map_type : MAP < primitive_type, data_type >
struct_type : STRUCT < col_name : data_type [COMMENT col_comment], ...>
union_type : UNIONTYPE < data_type, data_type, ... >

CREATE TABLE IF NOT EXISTS studentdata


(
name string comment 'student name',
subjects array<string> comment 'student subjects',
marks map<string,int> comment 'student marks',
address struct< loc : string comment 'student location', pincode : int comment
'student pincode', city : string comment 'student city'>,
details uniontype<double, int, array<string>, struct<a:int,b:string>, boolean>
comment 'student details'
)
COMMENT 'My table name is studentdata'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/deepak/studentdata'
TBLPROPERTIES ('key1'='value1', 'key2'='value2')
;

sample data:
--------------------
venkat#math,phy,chem#math:40,phy:23,chem:60#kp,500004,pune#0,75.6
kumar#math,phy,chem#math:50,phy:20,chem:40#mundwa,500005,pune#1,78
rahul#math,phy,chem#math:30,phy:22,chem:30#hinjewadi,500003,pune#2,hi:hello
anil#math,phy,chem#math:13,phy:30,chem:60#tingre nagar,500002,pune#3,1:rank
arun#math,phy,chem#math:10,phy:20,chem:50#aundh,500001,pune#4,true

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/studentdata.txt' OVERWRITE


INTO TABLE studentdata;

SELECT * FROM studentdata;

SELECT name, subjects[0], marks['math'], address.pincode, details FROM


studentdata;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/hive_inputs/studentdata'


SELECT * FROM studentdata;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/hive_inputs/studentdata_1'


SELECT name, subjects[0], marks['math'], address.pincode, details FROM
studentdata;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/hive_inputs/studentdata_2'


ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n'
SELECT * FROM studentdata;

HOW TO CREATE PARTITION TABLE


------------------------------------------
CREATE TABLE IF NOT EXISTS student_partition ( name string , id int )
PARTITIONED BY ( course string , year int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
;

HOW TO WRITE HIVE STATIC PARTITION QUERIES


--------------------------------------------------------------
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student_cse_1.txt' OVERWRITE
INTO TABLE student_partition PARTITION(course = 'cse', year = 1);

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student_cse_2.txt' OVERWRITE


INTO TABLE student_partition PARTITION(course = 'cse', year = 2);

HOW TO WRITE PARTITION TABLE QUERIES


--------------------------------------------------------------
SHOW PARTITIONS student_partition;

SELECT * FROM student_partition;

SELECT * FROM student_partition WHERE year = 1;


SELECT * FROM student_partition WHERE course = 'cse';

SELECT * FROM student_partition WHERE course = 'mech' and year = 1;


SELECT * FROM student_partition WHERE course = 'mech' or year = 1;

HOW TO WRITE HIVE DYNAMIC PARTITION QUERIES


--------------------------------------------------------------
set hive.exec.dynamic.partition.mode=nonstrict;

INSERT OVERWRITE TABLE student_partition PARTITION(course, year) select * from


student;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student1.txt' INTO TABLE


student;

INSERT INTO TABLE student_partition PARTITION(course = 'it', year) SELECT name, id,
year FROM student WHERE course = 'it';

INSERT INTO TABLE student_partition PARTITION(course = 'ece', year = 3) SELECT


name, id FROM student WHERE course = 'it' AND year = 3;

INSERT OVERWRITE TABLE student_partition PARTITION(course = 'ece', year = 3) SELECT


name, id FROM student WHERE course = 'it' AND year = 3;

INSERT INTO TABLE student_partition PARTITION(course = 'mca', year = 1) VALUES


('arun', 1), ('anil', 2), ('sudheer', 3), ('santosh', 4);

BELOW HIVE QUERIES ARE NOT ALLOWED


--------------------------------------------------------------
Note:- Dynamic partition cannot be the parent of a static partition '3'

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student_partition PARTITION(course = 'cse', year);

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student_partition PARTITION(course, year = 1);

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt' OVERWRITE INTO


TABLE student_partition PARTITION(course, year);

INSERT INTO TABLE student_partition PARTITION(course , year = 3) SELECT name, id,


course FROM student WHERE year = 3;

DIFFERENT WAYS TO EXECUTE HIVE QUERIES


--------------------------------------------------------------
hive -e 'USE deepak; SELECT * FROM student;'
hive -S -e 'USE deepak; SELECT * FROM student;'

hadoop@hadoop:~$ hive -f '/home/deepak/work/hive_inputs/partition.hql';


hive> source /home/deepak/work/hive_inputs/partition.hql;

hadoop@hadoop:~$ hadoop fs -ls /


hive> dfs -ls / ;
hadoop@hadoop:~$ ls -l /home
hive> ! ls -l /home;

--------------------------------------------------------------
HOW TO WRITE BUCKET QUERIES IN HIVE
--------------------------------------------------------------

CREATE EXTERNAL TABLE IF NOT EXISTS users ( name string, id int)


ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/users.txt' OVERWRITE INTO


TABLE users;

SELECT * FROM users;

venkat 3
raj 2
appu 4
anil 8
anvith 9
sony 1
nani 6
rani 7
rajesh 10
lg 5

CREATE TABLE IF NOT EXISTS users1 ( name string, id int) CLUSTERED BY (id) INTO 4
BUCKETS;

CREATE TABLE IF NOT EXISTS users2 ( name string, id int) CLUSTERED BY (id) SORTED
BY (id DESC) INTO 5 BUCKETS;

set hive.enforce.bucketing=true;

INSERT OVERWRITE TABLE users1 SELECT * FROM users;


INSERT OVERWRITE TABLE users2 SELECT * FROM users;

or

FROM users
INSERT OVERWRITE TABLE users1 SELECT *
INSERT OVERWRITE TABLE users2 SELECT *;

--------------------------------------------------------------
HOW THE LOGIC OF BUCKET WORKS
--------------------------------------------------------------
x = hash_code(bucket_col) % no_of_buckets

bucket m out of n => if ( x + 1 == m) then print ( bucket_col )


bucket m out of n => if ( x == m - 1) then print ( bucket_col )

1 % 5 => 1
2 % 5 => 2
3 % 5 => 3
4 % 5 => 4
5 % 5 => 0
6 % 5 => 1
7 % 5 => 2
8 % 5 => 3
9 % 5 => 4
10 % 5 => 0

part_0 => 5,10 => bucket 1 out of 5


part_1 => 1,6 => bucket 2 out of 5
part_2 => 2,7 => bucket 3 out of 5
part_3 => 3,8 => bucket 4 out of 5
part_4 => 4,9 => bucket 5 out of 5

1 % 4 => 1
2 % 4 => 2
3 % 4 => 3
4 % 4 => 0
5 % 4 => 1
6 % 4 => 2
7 % 4 => 3
8 % 4 => 0
9 % 4 => 1
10 % 4 => 2

part_0 => 4,8 => bucket 1 out of 4


part_1 => 1,5,9 => bucket 2 out of 4
part_2 => 2,6,10 => bucket 3 out of 4
part_3 => 3,7 => bucket 4 out of 4

1 % 3 => 1
2 % 3 => 2
3 % 3 => 0
4 % 3 => 1
5 % 3 => 2
6 % 3 => 0
7 % 3 => 1
8 % 3 => 2
9 % 3 => 0
10 % 3 => 1

part_0 => 3,6,9 => bucket 1 out of 3


part_1 => 1,4,7,10 => bucket 2 out of 3
part_2 => 2,5,8 => bucket 3 out of 3

1 % 2 => 1
2 % 2 => 0
3 % 2 => 1
4 % 2 => 0
5 % 2 => 1
6 % 2 => 0
7 % 2 => 1
8 % 2 => 0
9 % 2 => 1
10 % 2 => 0

part_0 => 2,4,6,8,10 => bucket 1 out of 2


part_1 => 1,3,5,7,9 => bucket 2 out of 2
SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 4 on id );
anil 8
appu 4

SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 3 on id );


anvith 9
nani 6
venkat 3

SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 2 on id );


anil 8
appu 4
rajesh 10
nani 6
raj 2

SELECT * FROM users1 TABLESAMPLE ( bucket 1 out of 4 on rand() );

Buckets:
----------------------------------------
1. no. of buckets is fixed
2. we can't change the bucket number later
3. we will use only for static data

Partitions:
----------------------------------------
1. no.of partition columns are fixed
2. we can't change the partition columns later
3. we can use for static / dynamic data also
4. static data (static partition) / dynamic data (dynamic partition)

Hive => MapReduce


----------------------------------------
Partition => MultipleOutputs
Bucket => HashPartitioner

Note: Limitations of `HashPartitioner` is no.of partitions are fixed. we can solve


this problem using `MultipleOutputs`

--------------------------------------------------------------
HOW TO WRITE JOIN QUERIES IN HIVE
--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS products ( name string, id int , price int )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

CREATE TABLE IF NOT EXISTS sales ( name string, year int , percentage double )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/products.txt' OVERWRITE INTO


TABLE products;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/sales.txt' OVERWRITE INTO


TABLE sales;

select * from products;


hadoop 1 1000
hive 2 500
pig 3 750
hbase 4 600
spark 5 1500

select * from sales;


hadoop 2010 10.0
hadoop 2011 40.5
hadoop 2012 60.0
hive 2012 40.5
hive 2013 20.0
pig 2014 30.0
pig 2010 10.0
hbase 2014 70.5
mongodb 2013 50.5

--------------------------------------------------------------
REDUCE-SIDE-JOIN QUERIES IN HIVE
--------------------------------------------------------------

HOW TO WRITE INNER JOIN QUERY


--------------------------------------------------------------
SELECT products.* , sales.* from products JOIN sales ON products.name = sales.name;

hadoop 1 1000 hadoop 2010 10.0


hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2012 60.0
hive 2 500 hive 2012 40.5
hive 2 500 hive 2013 20.0
pig 3 750 pig 2014 30.0
pig 3 750 pig 2010 10.0
hbase 4 600 hbase 2014 70.5

HOW TO WRITE LEFT OUTER JOIN QUERY


--------------------------------------------------------------
SELECT products.* , sales.* from products LEFT OUTER JOIN sales ON products.name =
sales.name;

hadoop 1 1000 hadoop 2010 10.0


hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2012 60.0
hive 2 500 hive 2012 40.5
hive 2 500 hive 2013 20.0
pig 3 750 pig 2014 30.0
pig 3 750 pig 2010 10.0
hbase 4 600 hbase 2014 70.5
spark 5 1500 NULL NULL NULL

HOW TO WRITE RIGHT OUTER JOIN QUERY


--------------------------------------------------------------
SELECT products.* , sales.* from products RIGHT OUTER JOIN sales ON products.name =
sales.name;

hadoop 1 1000 hadoop 2010 10.0


hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2012 60.0
hive 2 500 hive 2012 40.5
hive 2 500 hive 2013 20.0
pig 3 750 pig 2014 30.0
pig 3 750 pig 2010 10.0
hbase 4 600 hbase 2014 70.5
NULL NULL NULL mongodb 2013 50.5

HOW TO WRITE FULL OUTER JOIN QUERY


--------------------------------------------------------------
SELECT products.* , sales.* from products FULL OUTER JOIN sales ON products.name =
sales.name;

hadoop 1 1000 hadoop 2012 60.0


hadoop 1 1000 hadoop 2011 40.5
hadoop 1 1000 hadoop 2010 10.0
hbase 4 600 hbase 2014 70.5
hive 2 500 hive 2013 20.0
hive 2 500 hive 2012 40.5
NULL NULL NULL mongodb 2013 50.5
pig 3 750 pig 2010 10.0
pig 3 750 pig 2014 30.0
spark 5 1500 NULL NULL NULL

--------------------------------------------------------------
MAP-SIDE-JOIN QUERIES IN HIVE
--------------------------------------------------------------

WHICH MAP-SIDE-JOIN QUERIES ARE ALLOWED ?


--------------------------------------------------------------
SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products JOIN sales ON
products.name = sales.name;

SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products JOIN sales ON


products.name = sales.name;

SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products LEFT OUTER JOIN
sales ON products.name = sales.name;

SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products RIGHT OUTER JOIN
sales ON products.name = sales.name;

WHICH MAP-SIDE-JOIN QUERIES ARE NOT ALLOWED ?


--------------------------------------------------------------
SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products LEFT OUTER JOIN
sales ON products.name = sales.name;

SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products RIGHT OUTER JOIN
sales ON products.name = sales.name;

SELECT /*+ MAPJOIN(products) */ products.* , sales.* from products FULL OUTER JOIN
sales ON products.name = sales.name;

SELECT /*+ MAPJOIN(sales) */ products.* , sales.* from products FULL OUTER JOIN
sales ON products.name = sales.name;

Note:
------------
inner join => mapjoin(left table) & mapjoin(right table) are allowed

left outer join => mapjoin(left table) not allowed & mapjoin(right table) allowed

right outer join => mapjoin(left table) allowed & mapjoin(right table) not allowed

full outer join => mapjoin(left table) & mapjoin(right table) are not allowed

--------------------------------------------------------------
SEMI-JOIN QUERIES IN HIVE
--------------------------------------------------------------
SELECT products.* from products LEFT SEMI JOIN sales ON products.name = sales.name;

hadoop 1 1000
hive 2 500
pig 3 750
hbase 4 600

--------------------------------------------------------------
HOW TO WRITE CUSTOM UDF QUERIES IN HIVE
--------------------------------------------------------------

SHOW FUNCTIONS;

DESCRIBE FUNCTION split;

DESCRIBE FUNCTION EXTENDED split;

--------------------------------------------------------------
WORDCOUNT SOLUTION IN HIVE USING HIVE FUNCTIONS
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS docs(line string);

LOAD DATA LOCAL INPATH '/home/deepak/work/hive_inputs/demoinput' OVERWRITE INTO


TABLE docs;

SELECT * FROM docs;

CREATE TABLE word_counts AS


SELECT word, count(1) as cnt FROM (SELECT explode(split(line,' ')) as word FROM
docs) w
GROUP BY word ORDER BY word;

SELECT * FROM word_counts;

--------------------------------------------------------------

ADD JAR /home/deepak/work/hive_inputs/hive-examples.jar;

LIST JARS;

DELETE JAR /home/deepak/work/hive_inputs/hive-examples.jar;

--------------------------------------------------------------

CREATE TEMPORARY FUNCTION md5sum AS


'com.deepak.deepak.hadoop.training.hive.udf.MD5SumUDF';

SELECT name, md5sum(name) FROM student;

DROP TEMPORARY FUNCTION IF EXISTS md5sum;

--------------------------------------------------------------

hadoop fs -put /home/deepak/work/hive_inputs/hive-examples.jar /hive/hive-


examples.jar

CREATE FUNCTION deepak.double_my_data AS


'com.deepak.deepak.hadoop.training.hive.udf.DoubleMyDataUDF' USING JAR
'hdfs:///hive/hive-examples.jar';

SELECT name, deepak.double_my_data(name, ":") FROM student;

DROP FUNCTION IF EXISTS deepak.double_my_data;

--------------------------------------------------------------
HOW TO WRITE SERDE QUERIES IN HIVE
--------------------------------------------------------------
What is a SerDe?
=> SerDe is a short name for "Serializer and Deserializer."
=> Hive uses SerDe (and FileFormat) to read and write table rows.
=> HDFS files --> InputFileFormat --> <key, value> --> Deserializer --> Row
object
=> Row object --> Serializer --> <key, value> --> OutputFileFormat --> HDFS
files

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_regex_serde_new


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES ("input.regex" = "([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)")
;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt'


OVERWRITE INTO TABLE student_regex_serde_new;

SELECT * FROM student_regex_serde_new WHERE year = 1;

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_regex_serde_old


( name string, id string , course string, year string )
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES ("input.regex" = "([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)")
;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student.txt'


OVERWRITE INTO TABLE student_regex_serde_old;

SELECT * FROM student_regex_serde_old WHERE year = 1;

--------------------------------------------------------------
HOW TO WRITE CUSTOM SERDE QUERIES IN HIVE
--------------------------------------------------------------

ADD JAR /home/deepak/work/hive_inputs/hive-examples.jar;

DROP TABLE student_multiply_serde;

CREATE TABLE IF NOT EXISTS student_multiply_serde


( name string, id int , course string, year int )
ROW FORMAT SERDE 'com.deepak.deepak.hadoop.training.hive.SerDe.ColumnMultiplySerDe'
WITH SERDEPROPERTIES ("number.multiply" = "3", "text.multiply" = "2")
;

DESCRIBE FORMATTED student_multiply_serde;

INSERT OVERWRITE TABLE student_multiply_serde SELECT * FROM student WHERE year = 1;

SELECT * FROM student WHERE year = 1;

SELECT * FROM student_multiply_serde WHERE year = 1;

--------------------------------------------------------------

DROP TABLE student_map_serde;

CREATE TABLE IF NOT EXISTS student_map_serde


( name string, id string , course string, year string )
ROW FORMAT SERDE 'com.deepak.deepak.hadoop.training.hive.SerDe.ColumnarMapSerDe'
;

INSERT OVERWRITE TABLE student_map_serde SELECT * FROM student WHERE year = 1;

SELECT * FROM student_map_serde WHERE year = 1;

--------------------------------------------------------------
Regex Serde for apache log
--------------------------------------------------------------
CREATE TABLE apache_log (
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
referer STRING,
agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|
[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/apache_clf.txt' OVERWRITE INTO


TABLE apache_log;

--------------------------------------------------------------
DIFFERENT WAYS TO STORE DATA USING HIVE QUERIES
--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_text


( name string, id int , course string, year int ) STORED AS TEXTFILE;

or

CREATE TABLE IF NOT EXISTS student_text


( name string, id int , course string, year int )
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_seq


( name string, id int , course string, year int ) STORED AS SEQUENCEFILE;

or

CREATE TABLE IF NOT EXISTS student_seq


( name string, id int , course string, year int )
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_rc


( name string, id int , course string, year int ) STORED AS RCFILE;

or
CREATE TABLE IF NOT EXISTS student_rc
( name string, id int , course string, year int )
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.RCFileInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.RCFileOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_orc


( name string, id int , course string, year int ) STORED AS ORC;

or

CREATE TABLE IF NOT EXISTS student_orc


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_avro


( name string, id int , course string, year int ) STORED AS AVRO;

or

CREATE TABLE IF NOT EXISTS student_avro


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat';

--------------------------------------------------------------

CREATE TABLE IF NOT EXISTS student_parquet


( name string, id int , course string, year int ) STORED AS PARQUET;

or

CREATE TABLE IF NOT EXISTS student_parquet


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat';

--------------------------------------------------------------

-- Multi-table insert: the source table is scanned only once while every
-- destination table (one per storage format) is populated in the same job.
-- Hive identifiers are case-insensitive, so STUDENT resolves to 'student'.
FROM STUDENT
INSERT OVERWRITE TABLE student_text SELECT *
INSERT OVERWRITE TABLE student_seq SELECT *
INSERT OVERWRITE TABLE student_rc SELECT *
INSERT OVERWRITE TABLE student_orc SELECT *
INSERT OVERWRITE TABLE student_avro SELECT *
INSERT OVERWRITE TABLE student_parquet SELECT *
;
--------------------------------------------------------------
HOW TO WRITE HIVE QUERIES USING TRANSFORM
--------------------------------------------------------------
SELECT TRANSFORM(name, id, year) USING '/bin/cat' AS (name, id, year) FROM student;

-- Stream rows through grep, keeping only those containing the word 'cse'.
-- Fixed: the original had stray trailing text after the semicolon
-- (".//grep serching the speciffic word"); '//' is not a comment marker in
-- HiveQL, so running this in a script would raise a parse error. The note is
-- now a proper '--' comment with the typos corrected.
SELECT TRANSFORM(name, id, course) USING '/bin/grep cse' AS (name, id, course) FROM
student;

SELECT TRANSFORM(name, id, year, course) USING 'python


/home/deepak/work/hive_inputs/student.py' AS (name, id, year, course) FROM student;

--------------------------------------------------------------
HOW TO WRITE ALTER QUERIES IN HIVE
--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS test ( a string, b int , c int );

CREATE TABLE IF NOT EXISTS test1 LIKE test;

ALTER TABLE test1 RENAME TO mytest;


hive> describe mytest;
a string
b int
c int

ALTER TABLE mytest REPLACE COLUMNS ( x string, y double, a string, b int , c


float);
hive> describe mytest;
x string
y double
a string
b int
c float

ALTER TABLE mytest REPLACE COLUMNS (x string, y double);


hive> describe mytest;
x string
y double

ALTER TABLE mytest ADD COLUMNS (c1 INT COMMENT 'c1 column', c2 STRING COMMENT 'c2
column');
hive> describe mytest;
x string
y double
c1 int c1 column
c2 string c2 column

ALTER TABLE mytest CHANGE c1 c1 STRING COMMENT 'new c1 column';


hive> describe mytest;
x string
y double
c1 string new c1 column
c2 string c2 column

ALTER TABLE mytest CHANGE c1 c11 STRING COMMENT 'new c11 column';
hive> describe mytest;
x string
y double
c11 string new c11 column
c2 string c2 column

ALTER TABLE mytest CHANGE c2 c21 STRING COMMENT 'new c21 column' after x;
hive> describe mytest;
x string
c21 string new c21 column
y double
c11 string new c11 column

ALTER TABLE mytest CHANGE c21 c2 STRING COMMENT 'new c2 column' FIRST;
hive> describe mytest;
c2 string new c2 column
x string
y double
c11 string new c11 column

ALTER TABLE mytest SET TBLPROPERTIES ('comment' = 'add comment');

ALTER TABLE mytest SET TBLPROPERTIES ('comment' = 'new comment');

ALTER TABLE mytest SET TBLPROPERTIES ('key1' = 'value1', 'key2' = 'value2');

ALTER TABLE mytest SET SERDEPROPERTIES ('field.delim' = ',');

ALTER TABLE mytest SET SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' WITH


SERDEPROPERTIES ('field.delim' = ':');

ALTER TABLE mytest ADD PARTITION (partCol = 'value1') location 'loc1';


ALTER TABLE mytest ADD PARTITION (partCol = 'value2') location 'loc2';

or

ALTER TABLE mytest ADD


PARTITION (partCol = 'value1') location 'loc1'
PARTITION (partCol = 'value2') location 'loc2';

--------------------------------------------------------------
DIFFERENT WAYS TO RUN HIVE QUERIES
--------------------------------------------------------------

Running Hive CLI


--------------------------------------------------------------
$HIVE_HOME/bin/hive

-- Difference between Beeline and the Hive CLI: Beeline connects to HiveServer2
-- over JDBC and is the recommended client on clusters.

Running HiveServer2 and Beeline


--------------------------------------------------------------
$HIVE_HOME/bin/hiveserver2
$HIVE_HOME/bin/beeline -u jdbc:hive2://localhost:10000/default -n deepak -d
org.apache.hive.jdbc.HiveDriver

Go for this more info:


https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients

Running HCatalog
--------------------------------------------------------------
$HIVE_HOME/hcatalog/sbin/hcat_server.sh
$HIVE_HOME/hcatalog/bin/hcat

Running WebHCat
--------------------------------------------------------------
$HIVE_HOME/hcatalog/sbin/webhcat_server.sh start
$HIVE_HOME/hcatalog/sbin/webhcat_server.sh stop

http://localhost:50111/templeton/v1

http://localhost:50111/templeton/v1/ddl/database/default/table/student?
user.name=hadoop

Role Management Commands -- authorization


--------------------------------------------------------------
CREATE ROLE deepak;

DROP ROLE deepak;

SHOW ROLES;

SHOW CURRENT ROLES;

--------------------------------------------------------------
ACID functionality in HIVE
--------------------------------------------------------------
Add the below properties in "hive-site.xml" file & restart hive server

<property>
<name>hive.in.test</name>
<value>false</value>
</property>
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
<property>
<name>hive.enforce.bucketing</name>
<value>true</value>
</property>
<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>

<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
</property>
<property>
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<property>
<name>hive.compactor.worker.threads</name>
<value>2</value>
</property>

CREATE HIVE TABLE WITH CLUSTERED BY, ORC, TBLPROPERTIES


--------------------------------------------------------------
CREATE TABLE IF NOT EXISTS student_acid
( name string, id int, course string, year int )
CLUSTERED BY (name) INTO 4 BUCKETS
STORED AS ORC
LOCATION '/hive/deepak/student_acid'
TBLPROPERTIES ('transactional' = 'true');

INSERT INTO TABLE student_acid VALUES


('arun', 1, 'mca', 1),
('anil', 2, 'mca', 1),
('sudheer', 3, 'mca', 2),
('santosh', 4, 'mca', 2)
;

UPDATE student_acid
SET year = 3, course = 'mech'
WHERE id = 4 ;

DELETE FROM student_acid WHERE name = 'anil';

==================================================================
LOAD DATA LOCAL INPATH '${env:HOME}/work/hive_inputs/student1.txt' INTO TABLE
student_partition PARTITION(course, year);

CREATE TABLE IF NOT EXISTS student_partition_test ( name string , id int )


PARTITIONED BY ( course string , year int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/hive/deepak/student_partition_test'
;

ALTER TABLE student_partition_test ADD PARTITION (course = 'cse' , year = 1)


location '/hive/deepak/student_partition_test/course=cse/year=1';

==================================================================

use deepak;

add jar /home/deepak/work/deepak-pdf-utils.jar;

drop table student_pdf;

CREATE TABLE IF NOT EXISTS student_pdf


( name string, id int , course string, year int )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ("field.delim" = " ")
STORED AS
INPUTFORMAT 'com.deepak.deepak.hadoop.training.pdf.mapred.deepakPdfInputFormat'
OUTPUTFORMAT
'com.deepak.deepak.hadoop.training.pdf.mapred.deepakHivePdfOutputFormat'
;

LOAD DATA LOCAL INPATH '/home/deepak/work/input/student.pdf' OVERWRITE INTO TABLE


student_pdf;

select * from student_pdf;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/output/hive-op'


ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS
INPUTFORMAT 'com.deepak.deepak.hadoop.training.pdf.mapred.deepakPdfInputFormat'
OUTPUTFORMAT
'com.deepak.deepak.hadoop.training.pdf.mapred.deepakHivePdfOutputFormat'
SELECT * FROM student_pdf;

INSERT OVERWRITE LOCAL DIRECTORY '/home/deepak/work/output/hive-op-1'


ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':'
SELECT * FROM student_pdf;

==================================================================

hadoop jar /home/deepak/work/deepak-pdf-utils.jar


com.deepak.deepak.hadoop.training.pdf.utils.deepakPdfWordCountJob
file:///home/deepak/work/input/student.pdf file:///home/deepak/work/output/mr-op

hadoop jar /home/deepak/work/deepak-pdf-utils.jar


com.deepak.deepak.hadoop.training.pdf.utils.deepakPdfWordCountJob
-Dmapred.pdfoutputformat.separator=*** file:///home/deepak/work/input/student.pdf
file:///home/deepak/work/output/mr-op-1

==================================================================

You might also like