Chapter 15 : Aggregate Operations

Chapter Learning Objectives

  • Various aggregate operations on data frame.

Chapter Outline

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
from IPython.display import display_html
import pandas as pd 
import numpy as np
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html(index=False)
        html_str+= "\xa0\xa0\xa0"*10
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)
space = "\xa0" * 10
import panel as pn

css = """
div.special_table + table, th, td {
  border: 3px solid orange;
}
"""
pn.extension(raw_css=[css])

1a. DataFrame Aggregations

../_images/15.png

Lets first understand the syntax

Syntax

pyspark.sql.DataFrame.agg(*exprs)

Aggregate on the entire DataFrame without groups (shorthand for df.groupBy.agg()).

‘’’

Input: Spark dataframe

df = spark.createDataFrame([(1,"north",100,"walmart"),(2,"south",300,"apple"),(3,"west",200,"google"),
                            (1,"east",200,"google"),(2,"north",100,"walmart"),(3,"west",300,"apple"),
                            (1,"north",200,"walmart"),(2,"east",500,"google"),(3,"west",400,"apple"),],
                          ["emp_id","region","sales","customer"])
                     
df.show()#show(truncate=False)
+------+------+-----+--------+
|emp_id|region|sales|customer|
+------+------+-----+--------+
|     1| north|  100| walmart|
|     2| south|  300|   apple|
|     3|  west|  200|  google|
|     1|  east|  200|  google|
|     2| north|  100| walmart|
|     3|  west|  300|   apple|
|     1| north|  200| walmart|
|     2|  east|  500|  google|
|     3|  west|  400|   apple|
+------+------+-----+--------+
print(df.sort('customer').toPandas().to_string(index=False))#show()
 emp_id region  sales customer
      2  south    300    apple
      3   west    300    apple
      3   west    400    apple
      3   west    200   google
      1   east    200   google
      2   east    500   google
      1  north    100  walmart
      2  north    100  walmart
      1  north    200  walmart

df.agg({"sales": "sum"}).show()
+----------+
|sum(sales)|
+----------+
|      2300|
+----------+
df.agg({"sales": "min"}).show()
+----------+
|min(sales)|
+----------+
|       100|
+----------+
df.agg({"sales": "max"}).show()
+----------+
|max(sales)|
+----------+
|       500|
+----------+
df.agg({"sales": "count"}).show()
+------------+
|count(sales)|
+------------+
|           9|
+------------+
df.agg({"sales": "mean"}).show()
+------------------+
|        avg(sales)|
+------------------+
|255.55555555555554|
+------------------+
df.agg({"sales": "mean","customer":"count"}).show()
+------------------+---------------+
|        avg(sales)|count(customer)|
+------------------+---------------+
|255.55555555555554|              9|
+------------------+---------------+

1b. DataFrame Aggregations

../_images/25.png
df.groupby("emp_id").agg({"sales": "sum"}).orderBy('emp_id').toPandas()#show()
emp_id sum(sales)
0 1 500
1 2 900
2 3 900
df.groupby("emp_id").agg({"sales": "max"}).orderBy('emp_id').toPandas()
emp_id max(sales)
0 1 200
1 2 500
2 3 400
df.groupby("emp_id").agg({"sales": "last"}).orderBy('emp_id').toPandas()
emp_id last(sales)
0 1 200
1 2 500
2 3 400
df.groupby("region").agg({"sales": "sum"}).orderBy('region').show()
+------+----------+
|region|sum(sales)|
+------+----------+
|  east|       700|
| north|       400|
| south|       300|
|  west|       900|
+------+----------+
df.groupby("customer").agg({"sales": "sum"}).orderBy('customer').show()
+--------+----------+
|customer|sum(sales)|
+--------+----------+
|   apple|      1000|
|  google|       900|
| walmart|       400|
+--------+----------+