from pyspark.sql import SparkSession
# Create a SparkSession
spark = SparkSession.builder.appName("LabExam").getOrCreate()
# Read the CSV file into a DataFrame, inferring column types so the numeric
# comparison below works (with header only, every column is read as a string)
df = spark.read.csv("data.csv", header=True, inferSchema=True)
# Filter rows where no_of_files is greater than 100
filtered_df = df.filter(df["no_of_files"] > 100)
# Show the filtered results
filtered_df.show()
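# If schema inference is too costly on a large file, a cheaper alternative is
# to cast only the column being compared. A minimal sketch, assuming the same
# data.csv and no_of_files column as above:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.appName("LabExam").getOrCreate()
# Read without schema inference; every column arrives as a string
df = spark.read.csv("data.csv", header=True)
# Cast just the comparison column to int, then filter numerically
# (rows where the cast fails become null and are dropped by the filter)
filtered_df = df.filter(F.col("no_of_files").cast("int") > 100)
filtered_df.show()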
................
from pyspark.sql import SparkSession, functions as F
# Create a SparkSession
spark = SparkSession.builder.appName("LabExam").getOrCreate()
# Read the CSV file into a DataFrame
df = spark.read.csv("data.csv", header=True)
# Explode every column's values into a single column, one value per row
exploded_df = df.select(F.explode(F.array(*df.columns)).alias("column_value"))
# Compute a hash of each exploded value
hashed_df = exploded_df.withColumn("hash_value", F.hash(F.col("column_value")))
# Format each row as "hash, value" (cast the hash to string before concatenating)
formatted_df = hashed_df.select(
    F.concat(F.col("hash_value").cast("string"), F.lit(", "),
             F.col("column_value")).alias("output")
)
# Show the formatted results
formatted_df.show()
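# If the goal is to hash just one column (the original comment mentioned an
# airlines column) rather than every value, the explode step can be skipped.
# A minimal sketch, assuming the CSV has a column named "airlines"; sha2 is
# shown as a stable alternative to F.hash:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.appName("LabExam").getOrCreate()
df = spark.read.csv("data.csv", header=True)
# sha2(col, 256) returns a deterministic hex string, unlike F.hash,
# which returns Spark's internal 32-bit Murmur3 integer
hashed_df = df.select(
    F.concat(F.sha2(F.col("airlines"), 256), F.lit(", "),
             F.col("airlines")).alias("output")
)
hashed_df.show(truncate=False)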