unzip [Link].zip -d /home/hadoop/
unzip [Link] -d /home/hadoop/
hadoop@hp:~$ touch max_temp_mapper.py
hadoop@hp:~$ vi max_temp_mapper.py
hadoop@hp:~$ touch max_temp_reducer.py
hadoop@hp:~$ vi max_temp_reducer.py
hadoop@hp:~$ chmod +x max_temp_mapper.py max_temp_reducer.py
hadoop@hp:~$ realpath max_temp_mapper.py
/home/hadoop/max_temp_mapper.py
hadoop@hp:~$ realpath max_temp_reducer.py
/home/hadoop/max_temp_reducer.py
hadoop@hp:~$ hdfs dfs -put [Link] /user/hadoop/
hadoop@hp:~$ hdfs dfs -put [Link] /user/hadoop/
hadoop@hp:~$ hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
-input /user/hadoop/[Link] \
-input /user/hadoop/[Link] \
-output /user/hadoop/max_temp_output \
-mapper /home/hadoop/max_temp_mapper.py \
-reducer /home/hadoop/max_temp_reducer.py
hdfs dfs -cat /user/hadoop/max_temp_output/part-00000
(Before re-running the job, delete the previous output folder: hadoop fs -rm -r /user/hadoop/max_temp_output)
max_temp_reducer.py
#!/usr/bin/env python3
import sys
def max_by_date(lines):
    """Yield ``(date, max_temperature)`` for each run of lines sharing a date.

    Each input line is expected to look like ``date<TAB>temperature``.
    Lines that do not contain exactly two tab-separated fields, or whose
    temperature is not a valid float, are skipped instead of aborting the
    whole reduce task.

    Only *consecutive* lines with the same date are merged, so the input
    must already be grouped by date (Hadoop Streaming sorts mapper output
    by key before handing it to the reducer).
    """
    current_date = None
    max_temperature = float('-inf')
    for line in lines:
        fields = line.strip().split('\t')
        # A blank or malformed line cannot be unpacked into key and value.
        if len(fields) != 2:
            continue
        date, raw_temperature = fields
        try:
            temperature = float(raw_temperature)
        except ValueError:
            # The temperature is not a valid float; ignore this record.
            continue
        if current_date == date:
            # Still on the same date: keep the larger temperature.
            if temperature > max_temperature:
                max_temperature = temperature
        else:
            # Moved on to a new date: emit the result for the previous one.
            if current_date is not None:
                yield current_date, max_temperature
            current_date = date
            max_temperature = temperature
    # Emit the result for the last date seen.
    if current_date is not None:
        yield current_date, max_temperature


def main():
    """Read ``date<TAB>temperature`` lines from stdin and print each date's maximum."""
    for date, max_temperature in max_by_date(sys.stdin):
        print(f"{date}\t{max_temperature}")


if __name__ == "__main__":
    main()
max_temp_mapper.py
#!/usr/bin/env python3
import sys
# Zero-based CSV column positions read by the mapper.
DATE_COLUMN = 1
TEMPERATURE_COLUMN = 3


def map_line(line):
    """Return ``(date, temperature)`` parsed from one CSV line, or ``None``.

    The line is stripped and split on commas; the date is taken from
    column 1 and the temperature from column 3 (zero-based). ``None`` is
    returned when the line has too few columns to contain a temperature
    or when the temperature field is not a valid float, so the caller
    can skip the record.
    """
    parts = line.strip().split(',')
    # The temperature lives at index 3, so at least four columns are needed;
    # a shorter row would otherwise raise an uncaught IndexError.
    if len(parts) <= TEMPERATURE_COLUMN:
        return None
    try:
        temperature = float(parts[TEMPERATURE_COLUMN])
    except ValueError:
        # The temperature is not a valid float (e.g. a header row).
        return None
    return parts[DATE_COLUMN], temperature


def main():
    """Read CSV lines from stdin and print ``date<TAB>temperature`` pairs."""
    for line in sys.stdin:
        record = map_line(line)
        if record is None:
            continue
        date, temperature = record
        print(f"{date}\t{temperature}")


if __name__ == "__main__":
    main()