Шпаргалка по синтаксису Pandas.

Creating DataFrames

df1 = pd.DataFrame(
    {"a":[4 ,5, 6],
     "b":[7, 8, 9],
     "c":[10, 11, 12]},
     index = [1, 2, 3])

df2 = pd.DataFrame(
    [[4, 7, 10],
     [5, 8, 11],
     [6, 9, 12]],
     index=[1, 2, 3],
     columns=['a', 'b', 'c'])

multiindex dataframe

df3 = pd.DataFrame(
    {"a":[4 ,5, 6],
     "b":[7, 8, 9],
     "c":[10, 11, 12]},
     index = pd.MultiIndex.from_tuples(
             [('d',1),('d',2),('e',2)],
              names=['n','v']))

Reshaping Data

df4 = pd.melt(df1, var_name='var', value_name='val')
df4.pivot(columns='var', values='val')
df5 = pd.concat([df1, df2], axis=1)

Sorting, drop, rename

df1.sort_index(ascending=False)
df1.sort_values('a', ascending=False)
df1.drop(columns=['b', 'c'])
df1.rename(columns = {'a':'c1', 'b':'c2', 'c':'c3'})

Subset Rows

df = pd.concat([df1, df2])
df.a >= 5  # return bool series
df[df.a >= 5]  # extract rows that meet logical criteria
df.drop_duplicates()  # remove duplicate rows
df.head(2)  # select first 2 rows
df.tail(2)  # select last 2 rows
df.sample(frac=0.5)  # randomly select fraction of rows
df.sample(n=3)  # randomly select n rows
df.iloc[2:4] # select rows by position
df.nlargest(3, 'b')  # select and order top 3 entries of column b
df.nsmallest(3, 'c') # select and order bottom 3 entries of column c

Subset Columns

df.a
df['a']
df[['a', 'b']]
df.filter(regex='^b$') # slect columns whose name matches regex
df.loc[:, 'b':'c'] # select all columns between b and c
df.iloc[:, [1,2]]  # select columns in positions 1, 2 (first is 0)
df.loc[df['a'] > 5, 'b':'c']

Summarize Data

df['a'].value_counts()
len(df)
df['a'].nunique()
df.describe()  # basic descriptive statistics for each column
df.sum()
df.count()
df.median()
df.quantile([0.25, 0.75, 0.90])
df.apply(sum)
df.min()
df.max()
df.mean()
df.var()
df.std()

Handling Missing Data

foo = df4.pivot(columns='var', values='val')
foo.dropna()
foo.fillna(1)

Make New Columns

df.assign(d=lambda df: df.a*df.b)  # Compute and append new columns
df['d'] = df.a*df.b  # Add single column

Group Data

df.groupby(by="col") # Return a GroupByobject, grouped by values in column named "col".
df.groupby(level="ind") # Return a GroupByobject, grouped by values in index level named "ind".

GroupBy Functions

size() # Size of each group.
agg(function) # Aggregate group using function.
shift(1) # Copy with values shifted by 1.
rank(method='dense') # Ranks with no gaps.
rank(method='min') # Ranks. Ties get min rank
rank(pct=True) # Ranks rescaled to interval [0, 1].
rank(method='first') # Ranks. Ties go to first value.
shift(-1) # Copy with values lagged by 1.
cumsum() # Cumulative sum.
cummax() # Cumulative max.
cummin() # Cumulative min.
cumprod() # Cumulative product.

Windows

df.expanding() # Return an Expanding object allowing summary functions to be applied cumulatively.
df.rolling(n) # Return a Rolling object allowing summary functions to be applied to windows of length n.

Plotting

df.plot.hist() # Histogram for each column
df.plot.scatter(x='w',y='h') # Scatter chart using pairs of points

Combine Data Sets

#Standard Joins
pd.merge(adf, bdf,how='left', on='x1') # Join matching rows from bdfto adf.
pd.merge(adf, bdf,how='right', on='x1') # Join matching rows from adfto bdf.
pd.merge(adf, bdf,how='inner', on='x1') # Join data. Retain only rows in both sets.
pd.merge(adf, bdf,how='outer', on='x1') # Join data. Retain all values, all rows.

#Filtering Joins
adf[adf.x1.isin(bdf.x1)] # All rows in adfthat have a match in bdf.
adf[~adf.x1.isin(bdf.x1)] # All rows in adfthat do not have a match in bdf.

#Set-like Operations
pd.merge(ydf, zdf) # Rows that appear in both ydfand zdf(Intersection).
pd.merge(ydf, zdf, how='outer') # Rows that appear in either or both ydfand zdf(Union).
pd.merge(ydf, zdf, how='outer', indicator=True).query('_merge == "left_only"').drop(columns=['_merge']) # Rows that appear in ydfbut not zdf(Setdiff).

pandas cheat sheet

⤧  Previous post VirtualBox мануал ⤧  Next post Переписываем SQL-запросы в Pandas