home

GroupBy

import pandas
import numpy

rng = numpy.random.RandomState(0)
df = pandas.DataFrame(
    {
        'key': ['A', 'B', 'C', 'A', 'B', 'C'],
        'data1': range(6),
        'data2': rng.randint(0, 10, 6)
    }
)

print(df)
#   key  data1  data2
# 0   A      0      5
# 1   B      1      0
# 2   C      2      3
# 3   A      3      3
# 4   B      4      7
# 5   C      5      9

grpby = df.groupby("key")

print(grpby.groups)
# {
#   'A': Int64Index([0, 3], dtype='int64'),
#   'B': Int64Index([1, 4], dtype='int64'),
#   'C': Int64Index([2, 5], dtype='int64')
# }

print(grpby.get_group("A"))
#   key  data1  data2
# 0   A      0      5
# 3   A      3      3

print(type(grpby.get_group("A")))
# <class 'pandas.core.frame.DataFrame'>

Aggregation

Process of taking multiple values and returning a single value, such as finding the maximum, the minimum or the mean.

import pandas
import numpy

rng = numpy.random.RandomState(0)
df = pandas.DataFrame(
    {
        'key': ['A', 'B', 'C', 'A', 'B', 'C'],
        'data1': range(6),
        'data2': rng.randint(0, 10, 6)
    },
    columns=['key', 'data1', 'data2']
)

print(df)
#   key  data1  data2
# 0   A      0      5
# 1   B      1      0
# 2   C      2      3
# 3   A      3      3
# 4   B      4      7
# 5   C      5      9

gp = df.groupby("key")
print(gp.aggregate(["median", "sum"]))
#      data1      data2    
#     median sum median sum
# key                      
# A      1.5   3    4.0   8
# B      2.5   5    3.5   7
# C      3.5   7    6.0  12

.describe()

Computes several common aggregates for all numerical columns a DataFrame.

import pandas
import numpy

rng = numpy.random.RandomState(0)
df = pandas.DataFrame(
    {
        'key': ['A', 'B', 'C', 'A', 'B', 'C'],
        'data1': range(6),
        'data2': rng.randint(0, 10, 6)
    },
    columns=['key', 'data1', 'data2']
)

print(df.describe())
#           data1     data2
# count  6.000000  6.000000
# mean   2.500000  4.500000
# std    1.870829  3.209361
# min    0.000000  0.000000
# 25%    1.250000  3.000000
# 50%    2.500000  4.000000
# 75%    3.750000  6.500000
# max    5.000000  9.000000

Method Dispatching

Any method, not explicitly implemented by a GroupBy object, will be applied on the Groups.

import pandas as pd

pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 1000)

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)

ege = pd.DataFrame(
    {
        "District": "Ege",
        "Population": [4270000, 356000, 231000, 557000]
    }, index=["İzmir", "Manisa", "Aydın", "Denizli"],
)

akdeniz = pd.DataFrame(
    {
        "District": "Akdeniz",
        "Population": [1730000, 479000, 1230000, 75500]
    }, index=["Adana", "Osmaniye", "Antalya", "Burdur"]
)

turkey = marmara.append(ege).append(akdeniz)

# .describe() called on GroupBy object. Had we more columns, they would have been included.
print(turkey.groupby("District").describe())
#          Population                                                                              
#               count       mean           std       min       25%       50%        75%         max
# District                                                                                         
# Akdeniz         4.0   878625.0  7.422968e+05   75500.0  378125.0  854500.0  1355000.0   1730000.0
# Ege             4.0  1353500.0  1.948965e+06  231000.0  324750.0  456500.0  1485250.0   4270000.0
# Marmara         4.0  3917750.0  7.389377e+06   89000.0  154250.0  291000.0  4054500.0  15000000.0

# .describe() applied after Column Indexing on GroupBy object.
print(turkey.groupby("District")["Population"].describe())
#           count       mean           std       min       25%       50%        75%         max
# District                                                                                     
# Akdeniz     4.0   878625.0  7.422968e+05   75500.0  378125.0  854500.0  1355000.0   1730000.0
# Ege         4.0  1353500.0  1.948965e+06  231000.0  324750.0  456500.0  1485250.0   4270000.0
# Marmara     4.0  3917750.0  7.389377e+06   89000.0  154250.0  291000.0  4054500.0  15000000.0

Filter

import pandas as pd

pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 1000)

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [176000, 15000000]
    }, index=["Tekirdağ", "İstanbul"]
)

ege = pd.DataFrame(
    {
        "District": "Ege",
        "Population": [4270000, 356000]
    }, index=["İzmir", "Manisa"],
)

akdeniz = pd.DataFrame(
    {
        "District": "Akdeniz",
        "Population": [1730000, 1230000]
    }, index=["Adana", "Antalya"]
)

turkey = marmara.append(ege).append(akdeniz)
grouped_by_district = turkey.groupby("District")

print("Total Population by District")
print(grouped_by_district["Population"].sum())
# Total Population by District
# District
# Akdeniz     2960000
# Ege         4626000
# Marmara    15176000

print(grouped_by_district["Population"].sum() > 5000000)
# District
# Akdeniz    False
# Ege        False
# Marmara     True
# Name: Population, dtype: bool

print(grouped_by_district["Population"].sum()[grouped_by_district["Population"].sum() > 5000000])
# District
# Marmara    15176000
# Name: Population, dtype: int64

def filter_by_population(df):
    return df["Population"].sum() > 5000000

print("DataFrame filtered by Total Population By District")
print(grouped_by_district.filter(filter_by_population))
# DataFrame filtered by Total Population By District
#          District  Population
# Tekirdağ  Marmara      176000
# İstanbul  Marmara    15000000


print("Total Population by District -(DataFrame filtered by Total Population by District)")
print(grouped_by_district.filter(filter_by_population).groupby("District")["Population"].sum())
# Total Population by District -(DataFrame filtered by Total Population by District)
# District
# Marmara    15176000
# Name: Population, dtype: int64

Filtering Examples

Allows you to drop data based on the group properties.

import pandas
import numpy

rng = numpy.random.RandomState(0)
df = pandas.DataFrame(
    {
        'key': ['A', 'B', 'C', 'A', 'B', 'C'],
        'data1': range(6),
        'data2': rng.randint(0, 10, 6)
    },
    columns=['key', 'data1', 'data2']
)

print(df)
#   key  data1  data2
# 0   A      0      5
# 1   B      1      0
# 2   C      2      3
# 3   A      3      3
# 4   B      4      7
# 5   C      5      9

# Filter method must return a boolean result.
def do_filter(x):
    return x["data1"].sum() < 4

print(df.groupby("key").filter(do_filter))
#   key  data1  data2
# 0   A      0      5
# 3   A      3      3

Examples

Multi GroupBy and Finding the mean value

import numpy as np
import pandas as pd

np.random.seed(1)

df = pd.DataFrame(
    {
        "key1":["a", "a", "b", "b", "a"],
        "key2":["one", "two", "one", "two", "one"],
        "data1":np.random.rand(5),
        "data2":np.random.rand(5)
    }
)

print(df)
#   key1 key2     data1     data2
# 0    a  one  0.417022  0.092339
# 1    a  two  0.720324  0.186260
# 2    b  one  0.000114  0.345561
# 3    b  two  0.302333  0.396767
# 4    a  one  0.146756  0.538817

# Compute the mean of the data1 column using the labels from key1.
# w/o GroupBy
print(df[df["key1"] == "a"]["data1"].mean())
# 0.428034

print(df[df["key1"] == "b"]["data1"].mean())
# 0.151223

# w/ GroupBy
print(df.groupby("key1")["data1"].mean())
# key1
# a    0.428034
# b    0.151223

print(df.groupby(["key1", "key2"])["data1"].mean())
# key1  key2
# a     one     0.281889
#       two     0.720324
# b     one     0.000114
#       two     0.302333

Finding Unique and Number of Unique Values

import pandas as pd

# col_1,col_2
# a,ab
# a,ab
# a,ab
# a,ab
# b,ba
# b,bb
# b,bc

df = pd.read_csv('data.csv')

print(df.groupby('col_1').count())
#        col_2
# col_1
# a          4
# b          3

print(df.groupby('col_1')['col_2'].unique())
# a            [ab]
# b    [ba, bb, bc]

print(df.groupby('col_1')['col_2'].nunique())
# col_1
# a    1
# b    3

print(df.groupby('col_1')['col_2'].nunique().idxmax())
# b

__

References & Further Reading