home

DataFrame

DataFrame Creation

DataFrame from ndarray

import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.rand(2, 2), 
    columns=['a', 'b'], 
    index=['x', 'q']
)

print(df)
#           a         b
# x  0.961636  0.223901
# q  0.564983  0.446227

DataFrame from Python Dictionary

import pandas as pd

dataFrame = pd.DataFrame(
    {
        "VIN": [83596, 98567, 14690],
        "MAKE": ["MAZDA", "VW", "KIA"],
        "STOCK": [True, True, False]
    }
)

print(dataFrame)
#      VIN   MAKE  STOCK
# 0  83596  MAZDA   True
# 1  98567     VW   True
# 2  14690    KIA  False

It is also possible to pass a single value to a certain key for constant value for all values in that particular column.

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "City": ["Edirne", "Kırklareli"],
        "Population": [406000, 89000]
    }
)

print(marmara)
#   District        City  Population
# 0  Marmara      Edirne      406000
# 1  Marmara  Kırklareli       89000

Explicit Index in DataFrame

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000]
    }, index=["Edirne", "Kırklareli"]
)

print(marmara)
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000

DataFrame from List of Lists

import pandas as pd

df = pd.DataFrame(
    [
        [83596, "MAZDA", True],
        [98567, "VW", True],
        [14690, "KIA", True]
    ], columns=["VIN", "MAKE", "STOCK"]
)

print(df)
#      VIN   MAKE  STOCK
# 0  83596  MAZDA   True
# 1  98567     VW   True
# 2  14690    KIA   True

Multi-Level Index

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000]
    }, index=["Edirne", "Kırklareli"]
)

ege = pd.DataFrame(
    {
        "District": "Ege",
        "Population": [4270000, 356000]
    }, index=["İzmir", "Manisa"],
)

turkey = marmara.append(ege)
print(turkey)
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000
# İzmir           Ege     4270000
# Manisa          Ege      356000

turkey['City Name'] = turkey.index
print(turkey)
#            District  Population   City Name
# Edirne      Marmara      406000      Edirne
# Kırklareli  Marmara       89000  Kırklareli
# İzmir           Ege     4270000       İzmir
# Manisa          Ege      356000      Manisa

# Reset Index to integers, drop the current index.
turkey = turkey.reset_index(drop=True)
print(turkey)
#    District  Population   City Name
# 0   Marmara      406000      Edirne
# 1   Marmara       89000  Kırklareli
# 2       Ege     4270000       İzmir
# 3       Ege      356000      Manisa

# Multi - Level Indexing
turkey = turkey.set_index(['District', 'City Name'])
print(turkey)
#                      Population
# District City Name
# Marmara  Edirne          406000
#          Kırklareli       89000
# Ege      İzmir          4270000
#          Manisa          356000

# Multi - Level Indexed DataFrame Queries
# Query Example 1
print(turkey.loc['Marmara', 'Edirne'])
# Population    406000
# Name: (Marmara, Edirne), dtype: int64

# Query Example 2
print(turkey.loc[[('Marmara', 'Edirne'), ('Ege', 'Manisa')]])
#                     Population
# District City Name            
# Marmara  Edirne         406000
# Ege      Manisa         356000

# Query Example 3
print(turkey.loc['Marmara'])
#             Population
# City Name             
# Edirne          406000
# Kırklareli       89000

DataFrame Modification

Column Modifications

import pandas as pd

turkey = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000]
    }, index=["Edirne", "Kırklareli"]
)

turkey["Ratio"] = (turkey["Population"] / turkey["Population"].sum()) * 100
print(turkey)
#            District  Population      Ratio
# Edirne      Marmara      406000   1.650440
# Kırklareli  Marmara       89000   0.361796

Index Modifications

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)

print(marmara)
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000
# Tekirdağ    Marmara      176000
# İstanbul    Marmara    15000000

marmara.index = ['Ankara', 'Texas', 'Houston', 'Miami']
print(marmara)
#         District  Population
# Ankara   Marmara      406000
# Texas    Marmara       89000
# Houston  Marmara      176000
# Miami    Marmara    15000000

reindex

Index labels must be unique.

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)

print(marmara)
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000
# Tekirdağ    Marmara      176000
# İstanbul    Marmara    15000000

marmara = marmara.reindex(["Tekirdağ", "İstanbul", "Edirne", "Kırklareli"])
print(marmara)
#            District  Population
# Tekirdağ    Marmara      176000
# İstanbul    Marmara    15000000
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000

# Heads up! This will not work!
marmara = marmara.reindex(['Ankara', 'Texas', 'Houston', 'Miami'])
print(marmara)
#         District  Population
# Ankara       NaN         NaN
# Texas        NaN         NaN
# Houston      NaN         NaN
# Miami        NaN         NaN

using .loc for reindex

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)


marmara = marmara.loc[["Tekirdağ", "İstanbul", "Edirne", "Kırklareli"], ['Population', 'District']]
print(marmara)
#             Population District
# Tekirdağ        176000  Marmara
# İstanbul      15000000  Marmara
# Edirne          406000  Marmara
# Kırklareli       89000  Marmara

Data Selection

import pandas as pd

turkey = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)

# Column Selection
print(turkey["Population"])
# Edirne          406000
# Kırklareli       89000
# Tekirdağ        176000
# İstanbul      15000000
# Name: Population, dtype: int64

print(type(turkey["Population"]))
# <class 'pandas.core.series.Series'>

# Row Selection
print(turkey.loc["Edirne"])
# District     Marmara
# Population    406000
# Name: Edirne, dtype: object

print(type(turkey.loc["Edirne"]))
# <class 'pandas.core.series.Series'>

# Masking
print(turkey[turkey["Population"] > 1000000])
#          District  Population
# İstanbul  Marmara    15000000

print(type(turkey[turkey["Population"] > 1000000]))
# <class 'pandas.core.frame.DataFrame'>

# Selecting Multiple Rows
print(turkey.loc[['Edirne', 'Kırklareli']])
#            District  Population
# Edirne      Marmara      406000
# Kırklareli  Marmara       89000

print(type(turkey.loc[['Edirne', 'Kırklareli']]))
# <class 'pandas.core.frame.DataFrame'>

Analysis on DataFrames

import pandas as pd

marmara = pd.DataFrame(
    {
        "District": "Marmara",
        "Population": [406000, 89000, 176000, 15000000]
    }, index=["Edirne", "Kırklareli", "Tekirdağ", "İstanbul"]
)

ege = pd.DataFrame(
    {
        "District": "Ege",
        "Population": [4270000, 356000, 231000, 557000]
    }, index=["İzmir", "Manisa", "Aydın", "Denizli"],
)

akdeniz = pd.DataFrame(
    {
        "District": "Akdeniz",
        "Population": [1730000, 479000, 1230000, 75500]
    }, index=["Adana", "Osmaniye", "Antalya", "Burdur"]
)

turkey = marmara.append(ege).append(akdeniz)

print(turkey.nunique())
# District       3
# Population    12

print(turkey.nsmallest(2, ["Population"])) # also: nlargest
#            District  Population
# Burdur      Akdeniz       75500
# Kırklareli  Marmara       89000

Mathematical Operations

import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.arange(-8, 8).reshape(4, 4)
    , columns=['a', 'b', 'c', 'd']
)

# Initial Data
print(df)
#    a  b  c  d
# 0 -8 -7 -6 -5
# 1 -4 -3 -2 -1
# 2  0  1  2  3
# 3  4  5  6  7

# By default every operation will be repeated for each column.
print(df.cumsum())
#     a   b   c   d
# 0   8   7   6   5
# 1  12  10   8   6
# 2  12  11  10   9
# 3  16  16  16  16

Mathematical Operations Using numpy

Official documentation

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)

df = pd.DataFrame(
    rng.randint(-4, 4, size=16).reshape(4, 4),
    columns=['a', 'b', 'c', 'd'],
    index=['x', 'q', 'y', 'z']
)

print(df)
#    a  b  c  d
# x  0  3  1 -4
# q -1 -1 -1  3
# y -3 -1  1 -2
# z  0  3  2 -4

print(np.abs(df))
#    a  b  c  d
# x  0  3  1  4
# q  1  1  1  3
# y  3  1  1  2
# z  0  3  2  4

print(np.sum(df))
# a   -4
# b    4
# c    3
# d   -7
# dtype: int64

print(np.max(df))
# a    0
# b    3
# c    2
# d    3
# dtype: int64

print(np.max(np.max(df)))
# 3

Operations Using apply

import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.arange(-8, 8).reshape(4, 4)
    , columns=['a', 'b', 'c', 'd']
)

print(df)
#    a  b  c  d
# 0 -8 -7 -6 -5
# 1 -4 -3 -2 -1
# 2  0  1  2  3
# 3  4  5  6  7

df = df.apply(lambda x : -x)
print(df)
#    a  b  c  d
# 0  8  7  6  5
# 1  4  3  2  1
# 2  0 -1 -2 -3
# 3 -4 -5 -6 -7

# Applied on each column by default.
df = df.apply(lambda x : min(x))
print(df)
# a   -4
# b   -5
# c   -6
# d   -7
# dtype: int64

df = pd.DataFrame(
    np.arange(-8, 8).reshape(4, 4)
    , columns=['a', 'b', 'c', 'd']
)

# Can be switched to be applied on each row by specifying axis = 1.
df = df.apply(lambda x : min(x), axis=1)
print(df)
# 0   -8
# 1   -4
# 2    0
# 3    4
# dtype: int64

Merging DataFrames

import pandas as pd

df_1 = pd.DataFrame()
df_2 = pd.DataFrame()

pd.merge(df_1, df_2, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)

One-to-One Join

Perfect Match

merge function recognizes each DataFrame has an identically named column (employee in this case), and automatically joins using this column as the key.

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(0, 2)]
})

df_2 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro2': [rd.randint(0, 9) for _ in range(0, 2)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  bar     0

print(df_2)
#   Name  Pro2
# 0  foo     4
# 1  bar     3

df_3 = df_1.merge(df_2)
print(df_3)
#   Name  Pro1  Pro2
# 0  foo     1     4
# 1  bar     0     3

inner Join

Only include keys that are found both in left and right.

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(0, 2)]
})

df_2 = pd.DataFrame({
    'Name': ['foo', 'baz'],
    'Pro2': [rd.randint(0, 9) for _ in range(0, 2)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  bar     0

print(df_2)
#   Name  Pro2
# 0  foo     4
# 1  baz     3

df_3 = df_1.merge(df_2) # default is inner!
print(df_3)
#   Name  Pro1  Pro2
# 0  foo     1     4

outer Join

Include all keys, found either only on left, either only on right, or both.

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(0, 2)]
})

df_2 = pd.DataFrame({
    'Name': ['foo', 'baz'],
    'Pro2': [rd.randint(0, 9) for _ in range(0, 2)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  bar     0

print(df_2)
#   Name  Pro2
# 0  foo     4
# 1  baz     3

df_3 = pd.merge(df_1, df_2, how='outer')
print(df_3)
#   Name  Pro1  Pro2
# 0  foo   1.0   4.0
# 1  bar   0.0   NaN
# 2  baz   NaN   3.0

left and right Joins

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(0, 2)]
})

df_2 = pd.DataFrame({
    'Name': ['foo', 'baz'],
    'Pro2': [rd.randint(0, 9) for _ in range(0, 2)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  bar     0

print(df_2)
#   Name  Pro2
# 0  foo     4
# 1  baz     3

print(pd.merge(df_1, df_2, how='left'))
#   Name  Pro1  Pro2
# 0  foo     1   4.0
# 1  bar     0   NaN

print(pd.merge(df_1, df_2, how='right'))
#   Name  Pro1  Pro2
# 0  foo   1.0     4
# 1  baz   NaN     3

Many-to-One Join

import pandas as pd
import random as rd

rd.seed(42)

df_1 = pd.DataFrame({
    'Name': ['foo', 'foo', 'bar', 'bar'],
    'Pro1': [rd.randint(0, 9) for _ in range(4)]
})

print(df_1)
#   Name  Pro1
# 0  foo     1
# 1  foo     0
# 2  bar     4
# 3  bar     3

df_2 = pd.DataFrame({
    'Name': ['foo', 'bar'],
    'Pro2': [rd.randint(0, 9) for _ in range(2)]
})

print(df_2)
#   Name  Pro2
# 0  foo     3
# 1  bar     2


print(df_1.merge(df_2))
#   Name  Pro1  Pro2
# 0  foo     1     3
# 1  foo     0     3
# 2  bar     4     2
# 3  bar     3     2

Many-to-Many Join

import pandas as pd

df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})

df2 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux',
                               'spreadsheets', 'organization']})
print(df1)
print(df2)

merged = df1.merge(df2) # type:pd.DataFrame
print(merged)
#   employee        group        skills
# 0      Bob   Accounting          math
# 1      Bob   Accounting  spreadsheets
# 2     Jake  Engineering        coding
# 3     Jake  Engineering         linux
# 4     Lisa  Engineering        coding
# 5     Lisa  Engineering         linux
# 6      Sue           HR  spreadsheets
# 7      Sue           HR  organization

References