Data Engineering 101: SQL and PySpark
Shwetank Singh
GritSetGrow - [Link]
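All of the one-line PySpark snippets below assume a running SparkSession named spark, a DataFrame df (plus df1 and df2 on the join and set-operation slides), and the pyspark.sql.functions helpers in scope. A minimal setup sketch along those lines; the column names and sample rows are illustrative, not part of the original deck:

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *  # when, col, sum, row_number, lag, lead, ... (shadows built-in sum/min/max)

spark = SparkSession.builder.appName("sql-pyspark-101").getOrCreate()

# Toy data so the snippets have something to run against.
df = spark.createDataFrame(
    [(1, "a", 10), (2, "b", 20), (3, "a", 30)],
    ["id", "col1", "col2"],
)
df1 = df
df2 = spark.createDataFrame([(1, "x"), (2, "y")], ["id", "col3"])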
SELECT ALL COLUMNS
SQL
SELECT * FROM table;
PYSPARK
df.select("*")
SELECT SPECIFIC COLUMNS
SQL
SELECT col1, col2 FROM table;
PYSPARK
df.select("col1", "col2")
FILTERING ROWS (WHERE CLAUSE)
SQL
SELECT * FROM table WHERE condition;
PYSPARK
df.filter("condition")
ORDERING ROWS
SQL
SELECT * FROM table ORDER BY col1;
PYSPARK
df.orderBy("col1")
ORDERING ROWS DESCENDING
SQL
SELECT * FROM table ORDER BY col1 DESC;
PYSPARK
df.orderBy(df.col1.desc())
LIMITING ROWS
SQL
SELECT * FROM table LIMIT 10;
PYSPARK
df.limit(10)
SELECTING DISTINCT VALUES
SQL
SELECT DISTINCT col1 FROM table;
PYSPARK
df.select("col1").distinct()
GROUP BY AND AGGREGATE (COUNT)
SQL
SELECT col1, COUNT(*) FROM table GROUP BY col1;
PYSPARK
df.groupBy("col1").count()
GROUP BY AND AGGREGATE (AVG)
SQL
SELECT col1, AVG(col2) FROM table GROUP BY col1;
PYSPARK
df.groupBy("col1").avg("col2")
INNER JOIN
SQL
SELECT * FROM table1
INNER JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id, "inner")
LEFT JOIN
SQL
SELECT * FROM table1
LEFT JOIN table2 ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id, "left")
RIGHT JOIN
SQL
SELECT * FROM table1
RIGHT JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id, "right")
FULL OUTER JOIN
SQL
SELECT * FROM table1
FULL JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id, "outer")
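To make the four join types concrete, here is a small self-contained sketch assuming the spark session from the setup above; the customers/orders tables and the id key are invented for the example:

customers = spark.createDataFrame([(1, "Ana"), (2, "Bo")], ["id", "name"])
orders = spark.createDataFrame([(1, 100), (1, 250), (3, 75)], ["id", "amount"])

inner = customers.join(orders, customers.id == orders.id, "inner")   # only matching ids
left = customers.join(orders, customers.id == orders.id, "left")     # all customers, nulls where unmatched
right = customers.join(orders, customers.id == orders.id, "right")   # all orders
full = customers.join(orders, customers.id == orders.id, "outer")    # everything from both sides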
SUBQUERIES
SQL
SELECT * FROM (SELECT col1, col2 FROM table) sub_table;
PYSPARK
sub_df = df.select("col1", "col2")
CASE STATEMENTS
SQL
SELECT col1,
CASE WHEN condition
THEN result ELSE result2 END
FROM table;
PYSPARK
df.select("col1", when(condition, result).otherwise(result2).alias("new_col"))
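A worked version of the CASE translation, assuming the sample df from the setup sketch; the threshold and the "small"/"large" labels are made up:

from pyspark.sql.functions import when

df.select(
    "col1",
    when(df.col2 < 15, "small").otherwise("large").alias("size_bucket"),
).show()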
WINDOW FUNCTIONS (ROW NUMBER)
SQL
SELECT col1,
ROW_NUMBER() OVER (ORDER BY col2) AS row_num
FROM table;
PYSPARK
df.withColumn("row_num", row_number().over(Window.orderBy("col2")))
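For example, to number rows within each col1 group rather than over the whole table, partition the window first (a sketch using the setup DataFrame):

from pyspark.sql import Window
from pyspark.sql.functions import row_number

w = Window.partitionBy("col1").orderBy("col2")
df.withColumn("row_num", row_number().over(w)).show()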
WINDOW FUNCTIONS (AGGREGATIONS)
SQL
SELECT col1,
SUM(col2) OVER (PARTITION BY col3) AS sum_col2
FROM table;
PYSPARK
df.withColumn("sum_col2", sum("col2").over(Window.partitionBy("col3")))
LAG FUNCTION
SQL
SELECT col1,
LAG(col2, 1) OVER (ORDER BY col3)
AS lag_col2 FROM table;
PYSPARK
df.withColumn("lag_col2", lag("col2", 1).over(Window.orderBy("col3")))
LEAD FUNCTION
SQL
SELECT col1,
LEAD(col2, 1) OVER (ORDER BY col3) AS lead_col2
FROM table;
PYSPARK
df.withColumn("lead_col2", lead("col2", 1).over(Window.orderBy("col3")))
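LAG and LEAD are often used together over the same ordered window; a small sketch against the setup DataFrame (ordering by id is just for illustration):

from pyspark.sql import Window
from pyspark.sql.functions import lag, lead

w = Window.orderBy("id")
df.select(
    "id",
    "col2",
    lag("col2", 1).over(w).alias("prev_col2"),   # value from the previous row
    lead("col2", 1).over(w).alias("next_col2"),  # value from the next row
).show()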
HANDLING NULLS (IS NULL)
SQL
SELECT * FROM table WHERE col1 IS NULL;
PYSPARK
df.filter(df.col1.isNull())
HANDLING NULLS (IS NOT NULL)
SQL
SELECT * FROM table WHERE col1 IS NOT NULL;
PYSPARK
df.filter(df.col1.isNotNull())
UNION OF TWO TABLES
SQL
SELECT * FROM table1
UNION
SELECT * FROM table2;
PYSPARK
df1.union(df2).distinct()
INTERSECT OF TWO TABLES
SQL
SELECT * FROM table1
INTERSECT
SELECT * FROM table2;
PYSPARK
df1.intersect(df2)
EXCEPT (DIFFERENCE) OF TWO TABLES
SQL
SELECT * FROM table1
EXCEPT
SELECT * FROM table2;
PYSPARK
df1.subtract(df2)
CREATING TEMPORARY VIEW
SQL
CREATE TEMP VIEW temp_table
AS SELECT * FROM table;
PYSPARK
df.createOrReplaceTempView("temp_table")
USING SQL QUERIES ON DATAFRAMES
SQL
SELECT * FROM temp_table;
PYSPARK
spark.sql("SELECT * FROM temp_table")
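Putting the last two slides together: register the DataFrame once, then query it with plain SQL. The view name "people" is illustrative:

df.createOrReplaceTempView("people")
spark.sql("SELECT col1, COUNT(*) AS n FROM people GROUP BY col1").show()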
PIVOTING DATA
SQL
SELECT * FROM
(SELECT col1, col2 FROM table)
PIVOT (SUM(col2) FOR col1 IN ('value1', 'value2'));
PYSPARK
df.groupBy().pivot("col1", ['value1', 'value2']).sum("col2")
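A runnable pivot sketch, assuming the spark session from the setup; the sales data, years, and regions are invented. Listing the pivot values up front saves Spark an extra pass to discover them:

sales = spark.createDataFrame(
    [("2024", "north", 10), ("2024", "south", 5), ("2025", "north", 7)],
    ["year", "region", "amount"],
)
sales.groupBy("year").pivot("region", ["north", "south"]).sum("amount").show()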
UPDATING ROWS
SQL
UPDATE table SET col1 = value WHERE condition;
PYSPARK
df = df.withColumn("col1", when(condition, value).otherwise(df.col1))
DELETING ROWS
SQL
DELETE FROM table WHERE condition;
PYSPARK
df = df.filter(~condition)
HANDLING DUPLICATES
SQL
SELECT col1, COUNT(*) FROM table
GROUP BY col1 HAVING COUNT(*) > 1;
PYSPARK
df.groupBy("col1").count().filter("count > 1")
CALCULATING PERCENTAGE
SQL
SELECT col1, (col2 / col3) * 100 AS percentage
FROM table;
PYSPARK
df.withColumn("percentage", (df.col2 / df.col3) * 100)
STRING FUNCTIONS (CONCATENATION)
SQL
SELECT CONCAT(col1, col2) AS new_col
FROM table;
PYSPARK
df.select(concat("col1", "col2").alias("new_col"))
DATE FUNCTIONS (CURRENT DATE)
SQL
SELECT CURRENT_DATE AS today;
PYSPARK
df.select(current_date().alias("today"))
EXTRACTING YEAR FROM DATE
SQL
SELECT EXTRACT(YEAR FROM date_col) AS year
FROM table;
PYSPARK
df.select(year("date_col").alias("year"))
CONDITIONAL AGGREGATION
SQL
SELECT SUM(CASE WHEN condition THEN col1 ELSE 0 END)
FROM table;
PYSPARK
df.select(sum(when(condition, df.col1).otherwise(0)))
RENAMING COLUMNS
SQL
SELECT col1 AS new_col1 FROM table;
PYSPARK
df.withColumnRenamed("col1", "new_col1")
DROPPING COLUMNS
SQL
ALTER TABLE table DROP COLUMN col1;
PYSPARK
df.drop("col1")
ADDING NEW COLUMNS
SQL
ALTER TABLE table
ADD COLUMN new_col data_type;
PYSPARK
df.withColumn("new_col", expression)
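In PySpark the expression can be a literal or anything built from existing columns; a short sketch with made-up column names:

from pyspark.sql.functions import lit

df = df.withColumn("source", lit("batch"))        # constant-valued column
df = df.withColumn("col2_doubled", df.col2 * 2)   # derived from an existing column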
REPLACING VALUES
SQL
UPDATE table SET col1 = new_value
WHERE condition;
PYSPARK
df.withColumn("col1", when(condition, new_value).otherwise(df.col1))
USING UDFS (USER-DEFINED FUNCTIONS)
SQL
CREATE FUNCTION my_udf AS ...;
SELECT my_udf(col1) FROM table;
PYSPARK
df.select(my_udf("col1"))
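The PySpark line above assumes my_udf is already defined. A minimal sketch of registering one for DataFrame use; the upper-casing body is only an example, and built-in functions should be preferred whenever one fits:

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def my_udf(value):
    return value.upper() if value is not None else None

df.select(my_udf("col1").alias("col1_upper")).show()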
EXPLODING ARRAYS
SQL
SELECT col1, EXPLODE(array_col) AS exploded_col
FROM table;
PYSPARK
df.select("col1", explode("array_col").alias("exploded_col"))
FLATTENING NESTED DATA
SQL
SELECT col1, nested_col.* FROM table;
PYSPARK
df.select("col1", "nested_col.*")
CASTING DATA TYPES
SQL
SELECT CAST(col1 AS data_type)
FROM table;
PYSPARK
df.select(df.col1.cast("data_type"))
HANDLING JSON DATA
SQL
SELECT JSON_VALUE(json_col, '$.key')
FROM table;
PYSPARK
df.select(get_json_object("json_col", "$.key"))
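A self-contained sketch of get_json_object with an invented JSON payload, assuming the spark session from the setup:

from pyspark.sql.functions import get_json_object

events = spark.createDataFrame(
    [('{"key": "signup", "user": 42}',)],
    ["json_col"],
)
events.select(get_json_object("json_col", "$.key").alias("key")).show()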
GROUPING SETS
SQL
SELECT col1, col2, SUM(col3)
FROM table
GROUP BY GROUPING SETS ((col1), (col2));
PYSPARK
df.createOrReplaceTempView("table")
spark.sql("SELECT col1, col2, SUM(col3) FROM table GROUP BY GROUPING SETS ((col1), (col2))")
ROLLUP
SQL
SELECT col1, col2, SUM(col3)
FROM table
GROUP BY ROLLUP(col1, col2);
PYSPARK
df.rollup("col1", "col2").agg(sum("col3"))
CUBE
SQL
SELECT col1, col2, SUM(col3)
FROM table GROUP BY CUBE(col1, col2);
PYSPARK
df.cube("col1", "col2").agg(sum("col3"))
RANK FUNCTION
SQL
SELECT col1,
RANK() OVER (ORDER BY col2) AS rank
FROM table;
PYSPARK
df.withColumn("rank", rank().over(Window.orderBy("col2")))
DENSE RANK FUNCTION
SQL
SELECT col1,
DENSE_RANK() OVER (ORDER BY col2) AS dense_rank
FROM table;
PYSPARK
df.withColumn("dense_rank", dense_rank().over(Window.orderBy("col2")))
CUMULATIVE SUM (RUNNING TOTAL)
SQL
SELECT col1,
SUM(col2) OVER (ORDER BY col1) AS running_total
FROM table;
PYSPARK
df.withColumn("running_total",
    sum("col2").over(Window.orderBy("col1")
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)))
HANDLING DATES (DATE DIFFERENCE)
SQL
SELECT DATEDIFF(date1, date2) FROM table;
PYSPARK
df.select(datediff("date1", "date2"))
STRING FUNCTIONS (SUBSTRING)
SQL
SELECT SUBSTRING(col1, start, length)
FROM table;
PYSPARK
df.select(substring("col1", start, length))
UPPER AND LOWER CASE CONVERSION
SQL
SELECT UPPER(col1), LOWER(col2)
FROM table;
PYSPARK
df.select(upper("col1"), lower("col2"))
FILTER WITH IN CLAUSE
SQL
SELECT * FROM table
WHERE col1 IN (value1, value2);
PYSPARK
df.filter(df.col1.isin(value1, value2))
FILTER WITH BETWEEN CLAUSE
SQL
SELECT * FROM table
WHERE col1 BETWEEN value1 AND value2;
PYSPARK
df.filter(df.col1.between(value1, value2))
ORDER BY MULTIPLE COLUMNS
SQL
SELECT * FROM table
ORDER BY col1, col2 DESC;
PYSPARK
df.orderBy("col1", df.col2.desc())