# Import data from a MySQL database into HDFS using Sqoop
#1 - Importing orders table data from retail_db into HDFS
#  - textfile format with the default delimiter and the default number of mappers
#  - even if we don't specify --target-dir, the output will be written to the default
#    HDFS location (a directory named after the table under the user's HDFS home directory)
#  - when we use the --query or -e switch, we must specify --target-dir
sqoop import \
--connect="jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--table orders \
--target-dir /user/cloudera/orders \
--as-textfile
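#  - (optional check) list the output directory; with the default number of mappers you
#    should see several part-m-* files plus a _SUCCESS marker
hdfs dfs -ls /user/cloudera/orders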
#2 - Importing orders table data from retail_db into HDFS
#  - textfile format with the default delimiter and the default number of mappers
#  - clean up the target dir if it already exists (--delete-target-dir)
sqoop import \
--connect="jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--table orders \
--target-dir /user/cloudera/orders \
--as-textfile \
--delete-target-dir
#3 - Importing orders table data from retail_db into HDFS
#  - textfile format with only one mapper (so the output will be a single file)
#  - custom delimiters: pipe-separated fields, newline-terminated lines
#  - clean up the target dir if it already exists
sqoop import \
--connect="jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--table orders \
--target-dir /user/cloudera/orders \
--as-textfile \
--delete-target-dir \
-m 1 \
--fields-terminated-by "|" \
--lines-terminated-by "\n"
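#  - (optional check; with one mapper the single output file is normally named part-m-00000)
hdfs dfs -cat /user/cloudera/orders/part-m-00000 | head -5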
#4 - Importing orders table data from retail_db into HDFS
#  - textfile format with only one mapper (so the output will be a single file)
#  - conditional import: only rows matching the --where predicate are imported
sqoop import \
--connect="jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--table orders \
--target-dir /user/cloudera/orders \
--as-textfile \
--delete-target-dir \
-m 1 \
--fields-terminated-by "|" \
--lines-terminated-by "\n" \
--where "order_id < 11"
#5 - Importing orders table data from retail_db into HDFS with --append
#  - textfile format with only one mapper (so each run adds a single file)
#  - conditional import: rows matching the --where predicate are appended to the existing target dir
sqoop import \
--connect="jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--table orders \
--target-dir /user/cloudera/orders \
--as-textfile \
--append \
-m 1 \
--fields-terminated-by "|" \
--lines-terminated-by "\n" \
--where "order_id > 10 and order_id < 101"
#6 - Importing orders table data from retail_db into HDFS
#  - textfile format with only one mapper (so the output will be a single file)
#  - clean up the target dir if it already exists
#  - only specific columns (order_id, order_date, order_status)
#  - when we use the --columns switch, there must be no whitespace between the column names
sqoop import \
--connect="jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--table orders \
--target-dir /user/cloudera/orders \
--as-textfile \
--delete-target-dir \
-m 1 \
--fields-terminated-by "|" \
--lines-terminated-by "\n" \
--columns order_id,order_date,order_status
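#  - (optional check; file name is illustrative) each line should now contain only the
#    three selected columns, pipe-delimited
hdfs dfs -cat /user/cloudera/orders/part-m-00000 | head -3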
#7 - Importing orders table data from retail_db into HDFS
#  - textfile format with a free-form query (--query / -e)
#  - when using a free-form query, we must specify --target-dir
#  - we must also specify --split-by with a column, unless we pass -m 1
sqoop import \
--connect jdbc:mysql://[Link]/retail_db \
--username retail_dba \
--password cloudera \
--delete-target-dir \
--as-textfile \
--target-dir=/user/cloudera/orders \
--query "select * from orders where \$CONDITIONS" \
--split-by order_customer_id
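#  - (hedged sketch) a free-form query can also join tables; this assumes the standard
#    retail_db order_items table and schema, and the target dir below is illustrative;
#    with -m 1 no --split-by is needed
sqoop import \
--connect jdbc:mysql://[Link]/retail_db \
--username retail_dba \
--password cloudera \
--delete-target-dir \
--as-textfile \
--target-dir /user/cloudera/order_details \
--query "select o.order_id, o.order_date, oi.order_item_subtotal from orders o join order_items oi on o.order_id = oi.order_item_order_id where \$CONDITIONS" \
-m 1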
#8 - Importing orders table data from retail_db into HDFS
#  - sequencefile format (--as-sequencefile)
#  - when using a free-form query, we must specify --target-dir
#  - we must also specify --split-by with a column, unless we pass -m 1
sqoop import \
--connect jdbc:mysql://[Link]/retail_db \
--username retail_dba \
--password cloudera \
--delete-target-dir \
--as-sequencefile \
--target-dir=/user/cloudera/orders \
--query "select * from orders where \$CONDITIONS" \
--split-by order_customer_id
#9 - Importing orders table data from retail_db into HDFS
#  - avrodatafile format (--as-avrodatafile); this also generates a .avsc file
#    (the Avro schema) in the local working folder
#  - when using a free-form query, we must specify --target-dir
#  - we must also specify --split-by with a column, unless we pass -m 1
sqoop import \
--connect jdbc:mysql://[Link]/retail_db \
--username retail_dba \
--password cloudera \
--delete-target-dir \
--as-avrodatafile \
--target-dir=/user/cloudera/orders \
--query "select * from orders where \$CONDITIONS" \
--split-by order_customer_id
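#  - (optional check) the generated Avro schema file lands in the local working directory,
#    and the HDFS output files carry a .avro extension
ls -l *.avsc
hdfs dfs -ls /user/cloudera/orders
#  - if the avro-tools utility is available, it can print the embedded schema of a data
#    file (file name below is illustrative)
hdfs dfs -get /user/cloudera/orders/part-m-00000.avro /tmp/
avro-tools getschema /tmp/part-m-00000.avro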
#10 - Importing all tables from retail_db into HDFS
#   - as textfile
sqoop import-all-tables \
--connect "jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--as-textfile
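#   - (hedged variant) without a base directory, each table lands in its own folder under
#     the user's HDFS home; the --warehouse-dir switch groups them under one parent
#     (the path below is illustrative)
sqoop import-all-tables \
--connect "jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--as-textfile \
--warehouse-dir /user/cloudera/retail_db
hdfs dfs -ls /user/cloudera/retail_db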
###########################################################################################################
# Export data to a MySQL database from HDFS using Sqoop
#1 - Exporting orders data from HDFS to the MySQL table orders_export_test
#  - first create an empty copy of the orders table (run this in the mysql client):
create table orders_export_test as select * from orders where 1=2;
sqoop export \
--connect "jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--table orders_export_test \
--export-dir /user/cloudera/orders
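#  - (optional check, assuming the mysql client is available and the files in
#    /user/cloudera/orders use the default comma delimiter; otherwise add
#    --input-fields-terminated-by to the export)
mysql -u retail_dba -pcloudera retail_db -e "select count(*) from orders_export_test"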
###########################################################################################################
# Change the delimiter and file format of data during import using Sqoop
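# - the switches shown in the imports above cover this: --fields-terminated-by and
#   --lines-terminated-by change the delimiters, while --as-textfile, --as-sequencefile
#   and --as-avrodatafile change the file format
# - (hedged sketch) a tab-delimited text import of the same orders table; the target dir
#   below is illustrative
sqoop import \
--connect "jdbc:mysql://[Link]/retail_db" \
--username retail_dba \
--password cloudera \
--table orders \
--target-dir /user/cloudera/orders_tab \
--delete-target-dir \
--as-textfile \
-m 1 \
--fields-terminated-by '\t' \
--lines-terminated-by '\n'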
###########################################################################################################
# Ingest real-time and near-real time (NRT) streaming data into HDFS using Flume
#1 - Ingest real-time data into HDFS
#  - below is the Flume conf file to read the data in real time
#  - it reads the output of the "tail" command and ingests it into HDFS
#  - save the conf below into the flume-exec-test.conf file under your local folder
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /opt/gen_logs/logs/[Link]
# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://[Link]/user/cloudera/flume/%y-%m-%d
a1.sinks.k1.hdfs.filePrefix = log
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
#  - to start populating logs, we can use the following command
startlogs
#  - to start the Flume agent
flume-ng agent --conf /home/cloudera --conf-file /home/cloudera/flume-exec-test.conf --name a1
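#  - (optional check) events should start appearing under date-partitioned folders in HDFS
hdfs dfs -ls /user/cloudera/flume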
#2 - Ingest near-real-time data into HDFS
#  - below is the Flume conf file to read the data in near real time
#  - it reads the telnet inputs and ingests them into HDFS
#  - save the conf below into the flume-netcat-test.conf file under your local folder
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://[Link]/user/cloudera/flume
a1.sinks.k1.hdfs.filePrefix = netcat
a1.sinks.k1.hdfs.fileType = DataStream
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
#  - to start populating input, we need to launch telnet on localhost:44444
telnet localhost 44444
#  - to start the Flume agent
flume-ng agent --conf /home/cloudera --conf-file /home/cloudera/flume-netcat-test.conf --name a1
###########################################################################################################
# Load data into and out of HDFS using the Hadoop File System (FS) commands
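# - (hedged examples of the core FS commands; the local and HDFS paths below are illustrative)
# - copy a local file into HDFS
hdfs dfs -mkdir -p /user/cloudera/fs_demo
hdfs dfs -put /home/cloudera/orders.csv /user/cloudera/fs_demo/
# - list and view it
hdfs dfs -ls /user/cloudera/fs_demo
hdfs dfs -cat /user/cloudera/fs_demo/orders.csv | head -5
# - copy it back out of HDFS to the local file system
hdfs dfs -get /user/cloudera/fs_demo/orders.csv /tmp/orders_copy.csv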
###########################################################################################################