DataFrame

Overview

DataFrame Creation

DataFrame from ndarray

import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.rand(2, 2), 
    columns=['a', 'b'], 
    index=['x', 'q']
)

print(df)
#           a         b
# x  0.961636  0.223901
# q  0.564983  0.446227

DataFrame from Python Dictionary

import pandas as pd

dataFrame = pd.DataFrame(
    {
        "VIN": [83596, 98567, 14690],
        "MAKE": ["MAZDA", "VW", "KIA"],
        "STOCK": [True, True, False]
    }
)

print(dataFrame)
#      VIN   MAKE  STOCK
# 0  83596  MAZDA   True
# 1  98567     VW   True
# 2  14690    KIA  False
import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "City": ["Edirne", "Kırklareli"],
        "Population": [406000, 89000]
    }
)

print(marmara)
#   District        City  Population
# 0  Marmara      Edirne      406000
# 1  Marmara  Kırklareli       89000

Explicit Index in DataFrame

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000]
    }, index=["Edirne", "Kırklareli"]
)

print(marmara)
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000

DataFrame from List of Lists

import pandas as pd

df = pd.DataFrame(
    [
        [83596, "MAZDA", True],
        [98567, "VW", True],
        [14690, "KIA", True]
    ], columns=["VIN", "MAKE", "STOCK"]
)

print(df)
#      VIN   MAKE  STOCK
# 0  83596  MAZDA   True
# 1  98567     VW   True
# 2  14690    KIA   True

Multi-Level Index

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000]
    }, index=["Edirne", "Kırklareli"]
)

ege = pd.DataFrame(
    {
        "District": "Ege",
        "Population": [4270000, 356000]
    }, index=["İzmir", "Manisa"],
)

turkey = marmara.append(ege)
print(turkey)
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000
# İzmir           Ege     4270000
# Manisa          Ege      356000

turkey['City Name'] = turkey.index
print(turkey)
#            District  Population   City Name
# Edirne      Marmara      406000      Edirne
# Kırklareli  Marmara       89000  Kırklareli
# İzmir           Ege     4270000       İzmir
# Manisa          Ege      356000      Manisa

# Reset Index to integers, drop the current index.
turkey = turkey.reset_index(drop=True)
print(turkey)
#    District  Population   City Name
# 0   Marmara      406000      Edirne
# 1   Marmara       89000  Kırklareli
# 2       Ege     4270000       İzmir
# 3       Ege      356000      Manisa

# Multi - Level Indexing
turkey = turkey.set_index(['District', 'City Name'])
print(turkey)
#                      Population
# District City Name
# Marmara  Edirne          406000
#          Kırklareli       89000
# Ege      İzmir          4270000
#          Manisa          356000

# Multi - Level Indexed DataFrame Queries
# Query Example 1
print(turkey.loc['Marmara', 'Edirne'])
# Population    406000
# Name: (Marmara, Edirne), dtype: int64

# Query Example 2
print(turkey.loc[[('Marmara', 'Edirne'), ('Ege', 'Manisa')]])
#                     Population
# District City Name            
# Marmara  Edirne         406000
# Ege      Manisa         356000

# Query Example 3
print(turkey.loc['Marmara'])
#             Population
# City Name             
# Edirne          406000
# Kırklareli       89000

DataFrame Modification

Column Modifications

import pandas as pd

turkey = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000]
    }, index=["Edirne", "Kırklareli"]
)

turkey["Ratio"] = (turkey["Population"] / turkey["Population"].sum()) * 100
print(turkey)
#            District  Population      Ratio
# Edirne      Marmara      406000   1.650440
# Kırklareli  Marmara       89000   0.361796

Index Modifications

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)

print(marmara)
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000
# Tekirdağ    Marmara      176000
# İstanbul    Marmara    15000000

marmara.index = ['Ankara', 'Texas', 'Houston', 'Miami']
print(marmara)
#         District  Population
# Ankara   Marmara      406000
# Texas    Marmara       89000
# Houston  Marmara      176000
# Miami    Marmara    15000000

reindex

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)

print(marmara)
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000
# Tekirdağ    Marmara      176000
# İstanbul    Marmara    15000000

marmara = marmara.reindex(["Tekirdağ", "İstanbul", "Edirne", "Kırklareli"])
print(marmara)
#            District  Population
# Tekirdağ    Marmara      176000
# İstanbul    Marmara    15000000
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000

# Heads up! This will not work!
marmara = marmara.reindex(['Ankara', 'Texas', 'Houston', 'Miami'])
print(marmara)
#         District  Population
# Ankara       NaN         NaN
# Texas        NaN         NaN
# Houston      NaN         NaN
# Miami        NaN         NaN

using .loc for reindex

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)


marmara = marmara.loc[["Tekirdağ", "İstanbul", "Edirne", "Kırklareli"], ['Population', 'District']]
print(marmara)
#             Population District
# Tekirdağ        176000  Marmara
# İstanbul      15000000  Marmara
# Edirne          406000  Marmara
# Kırklareli       89000  Marmara

Data Selection

import pandas as pd

turkey = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)

# Column Selection
print(turkey["Population"])
# Edirne          406000
# Kırklareli       89000
# Tekirdağ        176000
# İstanbul      15000000
# Name: Population, dtype: int64

print(type(turkey["Population"]))
# <class 'pandas.core.series.Series'>

# Row Selection
print(turkey.loc["Edirne"])
# District     Marmara
# Population    406000
# Name: Edirne, dtype: object

print(type(turkey.loc["Edirne"]))
# <class 'pandas.core.series.Series'>

# Masking
print(turkey[turkey["Population"] > 1000000])
#          District  Population
# İstanbul  Marmara    15000000

print(type(turkey[turkey["Population"] > 1000000]))
# <class 'pandas.core.frame.DataFrame'>

# Selecting Multiple Rows
print(turkey.loc[['Edirne', 'Kırklareli']])
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000

print(type(turkey.loc[['Edirne', 'Kırklareli']]))
# <class 'pandas.core.frame.DataFrame'>

Analysis on DataFrames

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)

ege = pd.DataFrame(
    {
        "District": "Ege",
        "Population": [4270000, 356000, 231000, 557000]
    }, index=["İzmir", "Manisa", "Aydın", "Denizli"],
)

akdeniz = pd.DataFrame(
    {
        "District": "Akdeniz",
        "Population": [1730000, 479000, 1230000, 75500]
    }, index=["Adana", "Osmaniye", "Antalya", "Burdur"]
)

turkey = marmara.append(ege).append(akdeniz)

print(turkey.nunique())
# District       3
# Population    12

print(turkey.nsmallest(2, ["Population"])) # also: nlargest
#            District  Population
# Burdur      Akdeniz       75500
# Kırklareli  Marmara       89000

Mathematical Operations

import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.arange(-8, 8).reshape(4, 4)
    , columns=['a', 'b', 'c', 'd']
)

# Initial Data
print(df)
#    a  b  c  d
# 0 -8 -7 -6 -5
# 1 -4 -3 -2 -1
# 2  0  1  2  3
# 3  4  5  6  7

# By default every operation will be repeated for each column.
print(df.cumsum())
#     a   b   c   d
# 0   8   7   6   5
# 1  12  10   8   6
# 2  12  11  10   9
# 3  16  16  16  16

Mathematical Operations Using numpy

Official documentation

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)

df = pd.DataFrame(
    rng.randint(-4, 4, size=16).reshape(4, 4),
    columns=['a', 'b', 'c', 'd'],
    index=['x', 'q', 'y', 'z']
)

print(df)
#    a  b  c  d
# x  0  3  1 -4
# q -1 -1 -1  3
# y -3 -1  1 -2
# z  0  3  2 -4

print(np.abs(df))
#    a  b  c  d
# x  0  3  1  4
# q  1  1  1  3
# y  3  1  1  2
# z  0  3  2  4

print(np.sum(df))
# a   -4
# b    4
# c    3
# d   -7
# dtype: int64

print(np.max(df))
# a    0
# b    3
# c    2
# d    3
# dtype: int64

print(np.max(np.max(df)))
# 3

Operations Using apply

import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.arange(-8, 8).reshape(4, 4)
    , columns=['a', 'b', 'c', 'd']
)

print(df)
#    a  b  c  d
# 0 -8 -7 -6 -5
# 1 -4 -3 -2 -1
# 2  0  1  2  3
# 3  4  5  6  7

df = df.apply(lambda x : -x)
print(df)
#    a  b  c  d
# 0  8  7  6  5
# 1  4  3  2  1
# 2  0 -1 -2 -3
# 3 -4 -5 -6 -7

# Applied on each column by default.
df = df.apply(lambda x : min(x))
print(df)
# a   -4
# b   -5
# c   -6
# d   -7
# dtype: int64

df = pd.DataFrame(
    np.arange(-8, 8).reshape(4, 4)
    , columns=['a', 'b', 'c', 'd']
)

# Can be switched to be applied on each row by specifying axis = 1.
df = df.apply(lambda x : min(x), axis=1)
print(df)
# 0   -8
# 1   -4
# 2    0
# 3    4
# dtype: int64

Merging DataFrames

import pandas as pd

df_1 = pd.DataFrame()
df_2 = pd.DataFrame()

pd.merge(df_1, df_2, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)

One-to-One Join

Perfect Match

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(0, 2)]
})

df_2 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro2': [rd.randint(0, 9) for _ in range(0, 2)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  bar     0

print(df_2)
#   Name  Pro2
# 0  foo     4
# 1  bar     3

df_3 = df_1.merge(df_2)
print(df_3)
#   Name  Pro1  Pro2
# 0  foo     1     4
# 1  bar     0     3

inner Join

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(0, 2)]
})

df_2 = pd.DataFrame({
    'Name': ['foo', 'baz'],
    'Pro2': [rd.randint(0, 9) for _ in range(0, 2)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  bar     0

print(df_2)
#   Name  Pro2
# 0  foo     4
# 1  baz     3

df_3 = df_1.merge(df_2) # default is inner!
print(df_3)
#   Name  Pro1  Pro2
# 0  foo     1     4

outer Join

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(0, 2)]
})

df_2 = pd.DataFrame({
    'Name': ['foo', 'baz'],
    'Pro2': [rd.randint(0, 9) for _ in range(0, 2)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  bar     0

print(df_2)
#   Name  Pro2
# 0  foo     4
# 1  baz     3

df_3 = pd.merge(df_1, df_2, how='outer')
print(df_3)
#   Name  Pro1  Pro2
# 0  foo   1.0   4.0
# 1  bar   0.0   NaN
# 2  baz   NaN   3.0

left and right Joins

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(0, 2)]
})

df_2 = pd.DataFrame({
    'Name': ['foo', 'baz'],
    'Pro2': [rd.randint(0, 9) for _ in range(0, 2)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  bar     0

print(df_2)
#   Name  Pro2
# 0  foo     4
# 1  baz     3

print(pd.merge(df_1, df_2, how='left'))
#   Name  Pro1  Pro2
# 0  foo     1   4.0
# 1  bar     0   NaN

print(pd.merge(df_1, df_2, how='right'))
#   Name  Pro1  Pro2
# 0  foo   1.0     4
# 1  baz   NaN     3

Many-to-One Join

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'foo', 'bar', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(4)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  foo     0
# 2  bar     4
# 3  bar     3

df_2 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro2': [rd.randint(0, 9) for _ in range(2)]
})

print(df_2)
#   Name  Pro2
# 0  foo     3
# 1  bar     2


print(df_1.merge(df_2))
#   Name  Pro1  Pro2
# 0  foo     1     3
# 1  foo     0     3
# 2  bar     4     2
# 3  bar     3     2

Many-to-Many Join

import pandas as pd

df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})

df2 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux',
                               'spreadsheets', 'organization']})
print(df1)
print(df2)

merged = df1.merge(df2) # type:pd.DataFrame
print(merged)
#   employee        group        skills
# 0      Bob   Accounting          math
# 1      Bob   Accounting  spreadsheets
# 2     Jake  Engineering        coding
# 3     Jake  Engineering         linux
# 4     Lisa  Engineering        coding
# 5     Lisa  Engineering         linux
# 6      Sue           HR  spreadsheets
# 7      Sue           HR  organization

References