Pandas Cheat Sheet
A cheat sheet for Pandas syntax.
Creating DataFrames
import pandas as pd

df1 = pd.DataFrame(
    {"a": [4, 5, 6],
     "b": [7, 8, 9],
     "c": [10, 11, 12]},
    index=[1, 2, 3])
df2 = pd.DataFrame(
    [[4, 7, 10],
     [5, 8, 11],
     [6, 9, 12]],
    index=[1, 2, 3],
    columns=['a', 'b', 'c'])
# MultiIndex DataFrame
df3 = pd.DataFrame(
    {"a": [4, 5, 6],
     "b": [7, 8, 9],
     "c": [10, 11, 12]},
    index=pd.MultiIndex.from_tuples(
        [('d', 1), ('d', 2), ('e', 2)],
        names=['n', 'v']))
Reshaping Data
df4 = pd.melt(df1, var_name='var', value_name='val')  # Gather columns into rows
df4.pivot(columns='var', values='val')  # Spread rows into columns
df5 = pd.concat([df1, df2], axis=1)  # Append columns of DataFrames side by side
Sorting, Dropping, Renaming
df1.sort_index(ascending=False)  # Sort by the index, descending
df1.sort_values('a', ascending=False)  # Sort by values in column a, descending
df1.drop(columns=['b', 'c'])  # Drop columns b and c
df1.rename(columns={'a': 'c1', 'b': 'c2', 'c': 'c3'})  # Rename columns
Subset Rows
df = pd.concat([df1, df2])  # Append rows of DataFrames
df.a >= 5  # Returns a Boolean Series
df[df.a >= 5] # extract rows that meet logical criteria
df.drop_duplicates() # remove duplicate rows
df.head(2) # select first 2 rows
df.tail(2) # select last 2 rows
df.sample(frac=0.5) # randomly select fraction of rows
df.sample(n=3) # randomly select n rows
df.iloc[2:4] # select rows by position
df.nlargest(3, 'b') # select and order top 3 entries of column b
df.nsmallest(3, 'c') # select and order bottom 3 entries of column c
Subset Columns
df.a  # Select single column as a Series (attribute access)
df['a']  # Select single column as a Series
df[['a', 'b']]  # Select multiple columns as a DataFrame
df.filter(regex='^b$')  # Select columns whose name matches a regular expression
df.loc[:, 'b':'c'] # select all columns between b and c
df.iloc[:, [1,2]] # select columns in positions 1, 2 (first is 0)
df.loc[df['a'] > 5, 'b':'c']  # Select rows meeting logical criteria, and only columns b to c
Summarize Data
df['a'].value_counts()  # Count of each unique value in column a
len(df)  # Number of rows in the DataFrame
df['a'].nunique()  # Number of distinct values in column a
df.describe() # basic descriptive statistics for each column
df.sum()  # Sum of each column
df.count()  # Count of non-NA values in each column
df.median()  # Median of each column
df.quantile([0.25, 0.75, 0.90])  # Quantiles of each column
df.apply(sum)  # Apply a function to each column
df.min()  # Minimum of each column
df.max()  # Maximum of each column
df.mean()  # Mean of each column
df.var()  # Variance of each column
df.std()  # Standard deviation of each column
Handling Missing Data
foo = df4.pivot(columns='var', values='val')  # A DataFrame containing NaN values
foo.dropna()  # Drop rows with any NaN values
foo.fillna(1)  # Replace all NaN values with 1
Make New Columns
df.assign(d=lambda df: df.a*df.b) # Compute and append new columns
df['d'] = df.a*df.b # Add single column
Group Data
df.groupby(by="col") # Return a GroupBy object, grouped by values in column named "col".
df.groupby(level="ind") # Return a GroupBy object, grouped by values in index level named "ind". The functions below apply to it (see the example after the list).
GroupBy Functions
size() # Size of each group.
agg(function) # Aggregate group using function.
shift(1) # Copy with values shifted by 1.
rank(method='dense') # Ranks with no gaps.
rank(method='min') # Ranks. Ties get the minimum rank.
rank(pct=True) # Ranks rescaled to interval [0, 1].
rank(method='first') # Ranks. Ties go to first value.
shift(-1) # Copy with values lagged by 1.
cumsum() # Cumulative sum.
cummax() # Cumulative max.
cummin() # Cumulative min.
cumprod() # Cumulative product.
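A minimal sketch (not part of the original sheet) showing these functions on a concrete GroupBy object; it reuses the df built in Subset Rows, and the choice of grouping column is illustrative.
g = df.groupby(by='a')  # GroupBy object
g.size()  # Number of rows in each group
g.agg('mean')  # Aggregate each remaining column per group
g['b'].agg(['min', 'max'])  # Aggregate one column with several functions
df.groupby('a')['b'].cumsum()  # Cumulative sum within each group
df.groupby('a')['b'].rank(method='dense')  # Dense ranks within each group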
Windows
df.expanding() # Return an Expanding object allowing summary functions to be applied cumulatively.
df.rolling(n) # Return a Rolling object allowing summary functions to be applied to windows of length n.
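A short sketch (added here, not from the source) of applying summary functions to those window objects, again using df from above.
df['a'].rolling(2).mean()  # Mean over a sliding window of 2 rows (first row is NaN)
df['a'].expanding().sum()  # Running sum over all rows seen so far
df.rolling(2).min()  # Window functions can also be applied column-wise to the whole frame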
Plotting
df.plot.hist() # Histogram for each column
df.plot.scatter(x='w',y='h') # Scatter chart using pairs of points
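Note that 'w' and 'h' above are placeholder column names, not columns of the frames defined earlier. A hedged example with the df used above (plotting requires matplotlib to be installed):
df.plot.scatter(x='a', y='b')  # Scatter chart of column a against column b
df['a'].plot.hist(bins=3)  # Histogram of a single column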
Combine Data Sets
# Standard Joins
pd.merge(adf, bdf, how='left', on='x1') # Join matching rows from bdf to adf.
pd.merge(adf, bdf, how='right', on='x1') # Join matching rows from adf to bdf.
pd.merge(adf, bdf, how='inner', on='x1') # Join data. Retain only rows in both sets.
pd.merge(adf, bdf, how='outer', on='x1') # Join data. Retain all values, all rows.
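adf and bdf are not defined in this sheet; a minimal illustrative pair (the contents are an assumption) and one join result:
adf = pd.DataFrame({'x1': ['A', 'B', 'C'], 'x2': [1, 2, 3]})
bdf = pd.DataFrame({'x1': ['A', 'B', 'D'], 'x3': [True, False, True]})
pd.merge(adf, bdf, how='left', on='x1')  # Rows A, B, C kept; x3 is NaN for C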
# Filtering Joins
adf[adf.x1.isin(bdf.x1)] # All rows in adf that have a match in bdf.
adf[~adf.x1.isin(bdf.x1)] # All rows in adf that do not have a match in bdf.
# Set-like Operations
pd.merge(ydf, zdf) # Rows that appear in both ydf and zdf (intersection).
pd.merge(ydf, zdf, how='outer') # Rows that appear in either or both ydf and zdf (union).
pd.merge(ydf, zdf, how='outer', indicator=True).query('_merge == "left_only"').drop(columns=['_merge']) # Rows that appear in ydf but not zdf (set difference).
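ydf and zdf are likewise undefined here; an assumed pair of frames sharing both columns, and the set-like results they produce:
ydf = pd.DataFrame({'x1': ['A', 'B', 'C'], 'x2': [1, 2, 3]})
zdf = pd.DataFrame({'x1': ['B', 'C', 'D'], 'x2': [2, 3, 4]})
pd.merge(ydf, zdf)  # Intersection: rows B and C
pd.merge(ydf, zdf, how='outer')  # Union: rows A, B, C, D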