create table records (year string, temperature int, quality int) row format
delimited fields TERMINATED by '\t';
load data local inpath '/home/cloudera/Desktop/[Link]'
OVERWRITE INTO TABLE records;
show tables ;
describe records ;
select * from records ;
select distinct year from records ;
CREATE TABLE stations (usaf STRING, wban STRING, name STRING) ROW
FORMAT SERDE '[Link]' WITH
SERDEPROPERTIES ("[Link]" = "(\\d{6})\\s(\\d{5})\\s(.{29}).*");
load data local inpath '/home/cloudera/Desktop/[Link]'
OVERWRITE INTO TABLE stations;
select * from stations where name like 'PARIS%' ;
Drop table records ;
create external table records (year string, temperature int, quality int) row
format delimited fields TERMINATED by '/t' location '/user/cloudera/records'
load data local inpath '/home/cloudera/Desktop/[Link]'
OVERWRITE INTO TABLE records;
describe formatted records ;
create table records2 (station string, year string, temperature int, quality int)
row format serde '[Link]' with
SERDEPROPERTIES ("[Link]"=".{4}(.{6}).{5}(.{4}).{68}\\+?(-?\\d{4})(\\d).*")
location '/user/cloudera/records2';
Code python
import re
import sys
for line in [Link]:
(year, temp, q) = [Link]().split()
if(temp != "9999" and [Link]("[01459]",q)):
print ("%s\t%s" % (year, temp))
add file /home/cloudera/Desktop/is_good_quality.py;
From records SELECT TRANSFORM(year, temperature, quality) USING
'is_good_quality' AS year, temperature;
SELECT year, max(temperature) FROM records2 WHERE temperature != 9999
AND quality IN (0, 1, 4, 5, 9) GROUP BY year;
SELECT year, MAX(temperature) FROM records2 WHERE temperature != 9999
AND quality IN (0, 1, 4, 5, 9) GROUP BY year;
SELECT [Link], MAX(CAST([Link] AS INT))FROM (FROM records2
SELECT TRANSFORM(year, temperature, quality) USING 'is_good_quality' AS
year,temperature) rec GROUP BY [Link];
SELECT year, AVG(temperature) FROM records2 WHERE temperature != 9999
AND quality IN (0, 1, 4, 5, 9) GROUP BY year;
SELECT AVG(max_temperature) FROM (SELECT station, year,
MAX(temperature) AS max_temperature FROM records2 WHERE
temperature != 9999 AND quality IN (0, 1, 4, 5, 9) GROUP BY station, year) mt;
SELECT FLOOR(avg_temperature/50), COUNT(station) FROM (SELECT station,
AVG(temperature) AS avg_temperature FROM records2 WHERE temperature !
= 9999 AND quality IN (0, 1, 4, 5, 9) GROUP BY station) mt GROUP BY
FLOOR(avg_temperature/50);
SELECT DISTINCT name FROM stations JOIN records2 ON ([Link] =
[Link]) WHERE year = '1901' OR year = '1902';
SELECT AVG(temperature) FROM stations JOIN records2 ON ([Link] =
[Link]) WHERE name LIKE 'UT0%' OR name LIKE 'TURKU%';
SELECT name, AVG(temperature) FROM stations JOIN records2 ON
([Link] = [Link]) WHERE name LIKE 'UT0%' OR name LIKE
'TURKU%' GROUP BY name;