4 Pandas Anti-Patterns to Avoid and How to Fix Them

Full write-up: 4 Pandas Anti-Patterns to Avoid and How to Fix Them

Code samples notebook

pandas-anti-patterns's People

Contributors

Stargazers

Watchers

pandas-anti-patterns's Issues

Great post

This is a great blog post, Aidan! Please consider the tidypandas package, which wraps pandas with a dplyr-like API.

Here is how typical tidypandas code looks in a piped style.

from tidypandas import tidyframe
from tidypandas.series_utils import dense_rank, case_when, if_else
import pandas as pd
import numpy as np

# Load the titles dataset; the path is relative to the current working directory.
df = pd.read_csv("Downloads/archive/titles.csv")
# Wrap the pandas DataFrame in a tidypandas tidyframe so the piped verbs
# (filter, mutate, separate, summarise, ...) used further down are available.
df_tidy = tidyframe(df)
# Show a per-column overview; the "#>" lines that follow are its captured output.
df_tidy.glimpse()
#> Rows: 5806
#> Columns: 15
#> id                   <object>  ts300399, tm84618, tm127384, tm70993, tm190788...
#> title                <object>  Five Came Back: The Reference Films, Taxi Driv...
#> type                 <object>  SHOW, MOVIE, MOVIE, MOVIE, MOVIE, SHOW, MOVIE,...
#> description          <object>  This collection includes 12 World War II-era p...
#> release_year         <int64>   1945, 1976, 1975, 1979, 1973, 1969, 1971, 1964...
#> age_certification    <object>  TV-MA, R, PG, R, R, TV-14, R, G, R, R, PG-13, ...
#> runtime              <int64>   48, 113, 91, 94, 133, 30, 102, 170, 104, 110, ...
#> genres               <object>  ['documentation'], ['crime', 'drama'], ['comed...
#> production_countries <object>  ['US'], ['US'], ['GB'], ['GB'], ['US'], ['GB']...
#> seasons              <float64> 1.0, nan, nan, nan, nan, 4.0, nan, nan, nan, n...
#> imdb_id              <object>  nan, tt0075314, tt0071853, tt0079470, tt007004...
#> imdb_score           <float64> nan, 8.3, 8.2, 8.0, 8.1, 8.8, 7.7, 7.8, 5.8, 7...
#> imdb_votes           <float64> nan, 795222.0, 530877.0, 392419.0, 391942.0, 7...
#> tmdb_popularity      <float64> 0.6, 27.61, 18.22, 17.5, 95.34, 12.92, 14.74, ...
#> tmdb_score           <float64> nan, 8.2, 7.8, 7.8, 7.7, 8.3, 7.5, 7.6, 6.2, 7...

# Piped tidypandas example: bucket shows by how common their first listed
# production country is, then compare the mean combined IMDb + TMDB score per
# bucket. The whole chain is a single parenthesised expression, so its result
# is what gets displayed.
# Step 1: keep only rows whose type is 'SHOW' and whose runtime exceeds 30.
(df_tidy.filter("runtime > 30 & type == 'SHOW'")
        # production_countries holds list-like text such as "['US']"; drop the
        # brackets. NOTE(review): the quote characters are not removed here.
        .mutate({"production_countries": ('x.str.replace("[", "")', )})
        .mutate({"production_countries": ('x.str.replace("]", "")', )})
        # Split the comma-separated text into the columns pc1, pc2 and pc3.
        # NOTE(review): strict = False presumably tolerates rows that do not
        # yield exactly three pieces -- confirm against the tidypandas docs.
        .separate(column_name = 'production_countries',
                  sep = ',',
                  into = ['pc1', 'pc2', 'pc3'],
                  strict = False
                  )
        # Add pc1_count (presumably the number of rows sharing each pc1 value).
        .add_count('pc1', name = "pc1_count")
        # Dense-rank pc1_count with ascending = False, so larger counts should
        # receive the smaller rank numbers.
        .mutate({'pc1_rank': (lambda x: dense_rank(x, ascending = False), 'pc1_count')})
        # Map the rank onto labelled buckets; anything above 20 falls back to
        # 'not_top'. NOTE(review): the conditions overlap (x <= 3 also satisfies
        # x <= 10), so this relies on case_when taking the first match -- confirm.
        .mutate({'top_rank': (lambda x: case_when(
                    [(x <= 3, 'top_03'),(x <= 10, 'top_10'),(x <= 20, 'top_20')],
                    default = 'not_top'
                    ),
                  'pc1_rank'
                  )
                 })
        # Convert the bucket labels to a pandas Categorical.
        .mutate({'top_rank': (pd.Categorical, )})
        # Subtract 1 from imdb_score for titles released after 2016; leave the
        # other rows unchanged.
        .mutate({'imdb_score': lambda x: if_else(x['release_year'] > 2016,
                                                 x["imdb_score"] - 1,
                                                 x["imdb_score"]
                                                 )
                })
        # score = imdb_score + tmdb_score (x and y presumably bind to the listed
        # columns in order).
        .mutate({'score': ('x + y', ['imdb_score', 'tmdb_score'])})
        # Per top_rank bucket: the length and the mean of score.
        .summarise({'score_count': (len, 'score'),
                    'score_mean': (np.mean, 'score')
                   },
                   by = 'top_rank'
                   )
        # Sort the buckets by mean score, highest first.
        .arrange([('score_mean', 'desc')])
        )

#> # A tidy dataframe: 4 X 3
#>     top_rank  score_count  score_mean
#>   <category>    <Float64>   <Float64>
#> 0     top_03        639.0   14.071599
#> 1     top_20        180.0   13.714667
#> 2    not_top         94.0   13.636111
#> 3     top_10        326.0   13.533333