from tidypandas import tidyframe
from tidypandas.series_utils import dense_rank, case_when, if_else
import pandas as pd
import numpy as np
# Load the titles dataset and wrap it in a tidyframe so the tidypandas
# verbs (filter, mutate, summarise, ...) can be chained on it.
df = pd.read_csv(filepath_or_buffer="Downloads/archive/titles.csv")
df_tidy = tidyframe(df)

# Print a per-column overview: name, dtype and the first few values.
df_tidy.glimpse()
#> Rows: 5806
#> Columns: 15
#> id <object> ts300399, tm84618, tm127384, tm70993, tm190788...
#> title <object> Five Came Back: The Reference Films, Taxi Driv...
#> type <object> SHOW, MOVIE, MOVIE, MOVIE, MOVIE, SHOW, MOVIE,...
#> description <object> This collection includes 12 World War II-era p...
#> release_year <int64> 1945, 1976, 1975, 1979, 1973, 1969, 1971, 1964...
#> age_certification <object> TV-MA, R, PG, R, R, TV-14, R, G, R, R, PG-13, ...
#> runtime <int64> 48, 113, 91, 94, 133, 30, 102, 170, 104, 110, ...
#> genres <object> ['documentation'], ['crime', 'drama'], ['comed...
#> production_countries <object> ['US'], ['US'], ['GB'], ['GB'], ['US'], ['GB']...
#> seasons <float64> 1.0, nan, nan, nan, nan, 4.0, nan, nan, nan, n...
#> imdb_id <object> nan, tt0075314, tt0071853, tt0079470, tt007004...
#> imdb_score <float64> nan, 8.3, 8.2, 8.0, 8.1, 8.8, 7.7, 7.8, 5.8, 7...
#> imdb_votes <float64> nan, 795222.0, 530877.0, 392419.0, 391942.0, 7...
#> tmdb_popularity <float64> 0.6, 27.61, 18.22, 17.5, 95.34, 12.92, 14.74, ...
#> tmdb_score <float64> nan, 8.2, 7.8, 7.8, 7.7, 8.3, 7.5, 7.6, 6.2, 7...
# Rank production countries by how many long-running shows they have, bucket
# those ranks, and compare the combined IMDb + TMDB score across the buckets.
(
    df_tidy
    # Keep only shows whose runtime is above 30.
    .filter("runtime > 30 & type == 'SHOW'")
    # production_countries holds list-like strings such as "['US']"; drop the
    # square brackets. regex=False states the literal match explicitly: the
    # default of `regex` in Series.str.replace changed in pandas 2.0, and a
    # lone "[" is not a valid regular expression.
    .mutate({"production_countries": ('x.str.replace("[", "", regex=False)',)})
    .mutate({"production_countries": ('x.str.replace("]", "", regex=False)',)})
    # Split the comma-separated countries into up to three columns.
    # NOTE(review): strict=False presumably tolerates rows that do not split
    # into exactly three pieces -- confirm against the tidypandas docs.
    .separate(
        column_name="production_countries",
        sep=",",
        into=["pc1", "pc2", "pc3"],
        strict=False,
    )
    # Attach the number of rows sharing each first-listed country (pc1).
    .add_count("pc1", name="pc1_count")
    # Dense-rank countries by that count; descending order, so the most
    # frequent country gets rank 1.
    .mutate({"pc1_rank": (lambda x: dense_rank(x, ascending=False), "pc1_count")})
    # Bucket the rank; conditions are listed from tightest to loosest and
    # anything past rank 20 falls through to the default label.
    .mutate(
        {
            "top_rank": (
                lambda x: case_when(
                    [(x <= 3, "top_03"), (x <= 10, "top_10"), (x <= 20, "top_20")],
                    default="not_top",
                ),
                "pc1_rank",
            )
        }
    )
    # Store the bucket labels as a pandas Categorical.
    .mutate({"top_rank": (pd.Categorical,)})
    # Subtract one point from the IMDb score of titles released after 2016.
    # Here the lambda receives the whole frame, hence the column indexing.
    .mutate(
        {
            "imdb_score": lambda x: if_else(
                x["release_year"] > 2016,
                x["imdb_score"] - 1,
                x["imdb_score"],
            )
        }
    )
    # Combined score: adjusted IMDb score plus TMDB score.
    .mutate({"score": ("x + y", ["imdb_score", "tmdb_score"])})
    # Per bucket: number of rows and mean combined score.
    # NOTE(review): len counts every row in the group, including rows whose
    # score is missing -- confirm that is the intended count.
    .summarise(
        {
            "score_count": (len, "score"),
            "score_mean": (np.mean, "score"),
        },
        by="top_rank",
    )
    # Highest mean score first.
    .arrange([("score_mean", "desc")])
)
#> # A tidy dataframe: 4 X 3
#>     top_rank    score_count  score_mean
#>     <category>  <Float64>    <Float64>
#> 0   top_03      639.0        14.071599
#> 1   top_20      180.0        13.714667
#> 2   not_top     94.0         13.636111
#> 3   top_10      326.0        13.533333