blazyy / movie-revenue-prediction

A project on predicting movie revenue using a variety of machine learning techniques.

Home Page: https://faaez.co.in/data-science-03

Initialization

import pandas as pd
from datetime import datetime
import numpy as np
from numpy import sqrt
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
from ast import literal_eval
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import power_transform
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm
import statsmodels
import statsmodels.stats.api as sms
from statsmodels.stats.diagnostic import het_white
from statsmodels.compat import lzip
from sklearn.utils import check_array
from scipy import stats
from scipy.special import boxcox1p
from tqdm import tqdm_notebook
from sklearn.model_selection import validation_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

%matplotlib inline
#style.use('seaborn-poster') #sets the size of the charts
style.use('ggplot')

#!sudo apt-get install python-sklearn

import warnings
warnings.filterwarnings('ignore')
movies1 = pd.read_csv('movies_metadata.csv')
movies2 = pd.read_csv('movie_metadata.csv')

Data Cleaning & Preprocessing

print(movies1.shape)
print(movies2.shape)
(45466, 24)
(5043, 28)
movies1.head(1)
adult belongs_to_collection budget genres homepage id imdb_id original_language original_title overview ... release_date revenue runtime spoken_languages status tagline title video vote_average vote_count
0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... http://toystory.disney.com/toy-story 862 tt0114709 en Toy Story Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] Released NaN Toy Story False 7.7 5415.0

1 rows × 24 columns

movies2.head(1)
color director_name num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_2_name actor_1_facebook_likes gross genres ... num_user_for_reviews language country content_rating budget title_year actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes
0 Color James Cameron 723.0 178.0 0.0 855.0 Joel David Moore 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi ... 3054.0 English USA PG-13 237000000.0 2009.0 936.0 7.9 1.78 33000

1 rows × 28 columns

Both datasets are good, but one is much larger than the other. Should I combine them to get more features at the cost of fewer samples, which could improve accuracy, or should I use the larger ~45k-row dataset on its own and rely on the greater number of training samples? I'll try the former option first.

movies2['imdb_id'] = movies2['movie_imdb_link'].apply(lambda x: x.split('/')[4])
movies = pd.merge(left = movies1, right = movies2, left_on = 'imdb_id', right_on = 'imdb_id')

Since I'm using two datasets, there are overlapping features. I will discard the redundant columns that carry less data. For example, if one dataset specifies genres in more detail than the other, I will keep that one.

Removing null values:

movies.head(1)
adult belongs_to_collection budget_x genres_x homepage id imdb_id original_language original_title overview ... num_user_for_reviews language country content_rating budget_y title_year actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes
0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... http://toystory.disney.com/toy-story 862 tt0114709 en Toy Story Led by Woody, Andy's toys live happily in his ... ... 391.0 English USA G 30000000.0 1995.0 1000.0 8.3 1.85 0

1 rows × 52 columns

print(movies['genres_x'].fillna('[]').apply(literal_eval).apply(lambda x: len([i['name'] for i in x]) if isinstance(x, list) else []).mean())
print(movies['genres_y'].apply(lambda x: len(x.split("|"))).mean())
2.5678881592882865
2.9112476170302903

As is evident from above, the genre feature in the second dataset carries more information, so I will discard the genre column from the first. I'm not sure whether this will improve prediction accuracy in any way, but I'll give it a shot.

I'm also going to take only the first genre from each movie's list of genres. This will of course cost some accuracy, but I don't have a good way to incorporate multiple genres into my algorithm. I could one-hot encode all of them, but then the model might favour movies that simply list more genres, which is not an indicator of success. (A sketch of that multi-hot alternative is shown below for reference.)
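For reference, a minimal sketch of how all genres could be multi-hot encoded with pandas (an approach not used here; the genre_ prefix is just illustrative):

# One 0/1 column per genre, regardless of how many genres a movie lists.
genre_dummies = movies['genres_y'].str.get_dummies(sep = '|').add_prefix('genre_')
# movies = pd.concat([movies, genre_dummies], axis = 1)  # would add ~20 binary columns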

movies['genre'] = movies['genres_y'].apply(lambda x: x.split("|")[0])
movies['genre'].unique()
array(['Adventure', 'Action', 'Comedy', 'Biography', 'Drama', 'Crime',
       'Documentary', 'Fantasy', 'Sci-Fi', 'Animation', 'Horror',
       'Film-Noir', 'Family', 'Western', 'Thriller', 'Musical', 'Mystery',
       'Romance', 'Music'], dtype=object)
movies.shape
(4721, 53)
'''
one_hot_genres = pd.get_dummies(movies['genre'], prefix = 'category')
one_hot_genre_cols = one_hot_genres.columns
movies = pd.concat([movies, one_hot_genres], axis=1, sort = False)
'''
"\none_hot_genres = pd.get_dummies(movies['genre'], prefix = 'category')\none_hot_genre_cols = one_hot_genres.columns\nmovies = pd.concat([movies, one_hot_genres], axis=1, sort = False)\n"

Another thing I notice is discrepancies between similar columns, for example gross vs. revenue. They seem to differ by a lot. How much? Let's find out.

(abs(movies['revenue'] - movies['gross']) / (movies['revenue'] + movies['gross']) * 100).mean()
43.07130868751626

A 43% difference on average. No thank you. After a bit of digging around, I found that movies2 contains US-only revenue, while movies1 has the worldwide revenue. What I want is the former, so I will discard the 'revenue' column from movies1 and keep 'gross'.

There's another interesting column, 'belongs_to_collection'. I will binarize it, since it's reasonable to expect that a movie belonging to a collection (i.e., a franchise) will earn more.

movies['belongs_to_collection'] = movies['belongs_to_collection'].apply(lambda x: 0 if pd.isna(x) else 1)

Converting to datetime format for convenience.

months = ['Placeholder', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
movies['release_date'] = movies['release_date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
movies['release_month'] = movies['release_date'].apply(lambda x: months[x.month])
movies['release_year'] = movies['release_date'].apply(lambda x: x.year)

Converting to lists:

movies['production_companies'] = movies['production_companies'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movies['production_countries'] = movies['production_countries'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movies['spoken_languages'] = movies['spoken_languages'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

Plot keywords would help a ton in building a movie recommendation system, but the goal here is to predict a movie's revenue, so they're of no use here.

columns_to_drop = ['revenue', 'movie_imdb_link', 'genres_x', 'genres_y', 'homepage', 'id', 'imdb_id', 'overview', 
                   'poster_path', 'status', 'tagline', 'movie_title', 'original_language', 'original_title', 'video',
                  'budget_x', 'language', 'country', 'adult', 'plot_keywords', 'aspect_ratio', 'runtime', 'title_year']
                   
movies = movies.drop(columns_to_drop, axis = 1).rename(columns = {'budget_y' : 'budget', 'gross' : 'revenue'})
movies = movies[movies['production_countries'].apply(lambda x: True if 'United States of America' in x else False)]
movies = movies[movies['spoken_languages'].apply(lambda x: True if 'English' in x else False)]
movies.median()
belongs_to_collection        0.000000e+00
popularity                   8.754132e+00
vote_average                 6.200000e+00
vote_count                   3.350000e+02
num_critic_for_reviews       1.250000e+02
duration                     1.050000e+02
director_facebook_likes      5.700000e+01
actor_3_facebook_likes       4.345000e+02
actor_1_facebook_likes       1.000000e+03
revenue                      3.240374e+07
num_voted_users              4.622100e+04
cast_total_facebook_likes    3.924000e+03
facenumber_in_poster         1.000000e+00
num_user_for_reviews         1.910000e+02
budget                       2.300000e+07
actor_2_facebook_likes       6.800000e+02
imdb_score                   6.500000e+00
movie_facebook_likes         2.090000e+02
release_year                 2.005000e+03
dtype: float64
print(movies.shape)
movies = movies.dropna()
#movies = movies.fillna(movies.median())
print(movies.shape)
(3943, 32)
(3358, 32)

Exploratory Data Analysis & Feature Selection

movies.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3358 entries, 0 to 4713
Data columns (total 32 columns):
belongs_to_collection        3358 non-null int64
popularity                   3358 non-null object
production_companies         3358 non-null object
production_countries         3358 non-null object
release_date                 3358 non-null datetime64[ns]
spoken_languages             3358 non-null object
title                        3358 non-null object
vote_average                 3358 non-null float64
vote_count                   3358 non-null float64
color                        3358 non-null object
director_name                3358 non-null object
num_critic_for_reviews       3358 non-null float64
duration                     3358 non-null float64
director_facebook_likes      3358 non-null float64
actor_3_facebook_likes       3358 non-null float64
actor_2_name                 3358 non-null object
actor_1_facebook_likes       3358 non-null float64
revenue                      3358 non-null float64
actor_1_name                 3358 non-null object
num_voted_users              3358 non-null int64
cast_total_facebook_likes    3358 non-null int64
actor_3_name                 3358 non-null object
facenumber_in_poster         3358 non-null float64
num_user_for_reviews         3358 non-null float64
content_rating               3358 non-null object
budget                       3358 non-null float64
actor_2_facebook_likes       3358 non-null float64
imdb_score                   3358 non-null float64
movie_facebook_likes         3358 non-null int64
genre                        3358 non-null object
release_month                3358 non-null object
release_year                 3358 non-null int64
dtypes: datetime64[ns](1), float64(13), int64(5), object(13)
memory usage: 865.7+ KB
movies.describe().shape
(8, 18)
movies.describe()
belongs_to_collection vote_average vote_count num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_1_facebook_likes revenue num_voted_users cast_total_facebook_likes facenumber_in_poster num_user_for_reviews budget actor_2_facebook_likes imdb_score movie_facebook_likes release_year
count 3358.000000 3358.000000 3358.000000 3358.000000 3358.000000 3358.000000 3358.000000 3358.000000 3.358000e+03 3.358000e+03 3358.000000 3358.000000 3358.000000 3.358000e+03 3358.000000 3358.000000 3358.000000 3358.000000
mean 0.250744 6.190887 954.145920 169.294223 110.233770 860.966647 824.218285 8135.464562 5.732073e+07 1.104946e+05 12190.299285 1.414532 351.083681 4.103701e+07 2171.393985 6.407058 9568.661108 2003.037522
std 0.433506 0.882913 1448.036597 124.773653 22.353895 3209.121228 1973.656712 15538.326953 7.243680e+07 1.567349e+05 19333.196884 2.099191 423.934717 4.410129e+07 4729.843994 1.055894 21847.848698 9.846291
min 0.000000 0.000000 0.000000 1.000000 37.000000 0.000000 0.000000 0.000000 1.620000e+02 1.030000e+02 0.000000 0.000000 6.000000 2.180000e+02 0.000000 1.600000 0.000000 1929.000000
25% 0.000000 5.700000 138.000000 78.000000 96.000000 11.000000 230.250000 807.250000 1.220310e+07 2.138275e+04 2172.250000 0.000000 116.000000 1.200000e+07 435.250000 5.800000 0.000000 1999.000000
50% 0.000000 6.200000 416.000000 140.000000 106.000000 65.000000 462.500000 2000.000000 3.395762e+07 5.703900e+04 4529.000000 1.000000 217.000000 2.600000e+07 723.000000 6.500000 245.000000 2004.000000
75% 1.000000 6.800000 1120.500000 226.000000 120.000000 241.000000 719.750000 13000.000000 7.266003e+07 1.353745e+05 16962.500000 2.000000 417.500000 5.500000e+07 1000.000000 7.100000 11000.000000 2010.000000
max 1.000000 9.300000 14075.000000 813.000000 330.000000 23000.000000 23000.000000 640000.000000 7.605058e+08 1.689764e+06 656730.000000 43.000000 5060.000000 3.000000e+08 137000.000000 9.300000 349000.000000 2016.000000

The revenue distribution is extremely skewed: most movies earn relatively little, with a long right tail of blockbusters. This shape resembles a Pareto distribution.

fig, ax = plt.subplots(figsize = (15, 5))
sns.distplot(movies['revenue'])
plt.xlabel('Revenue')
#fig.savefig('revenue_dist.png', format='png', dpi=1200)
#files.download("revenue_dist.png") 
[figure: distribution of revenue]

from pylab import figure, text, scatter, show
fig, ax = plt.subplots()
plt.figure(figsize = (10, 5))
sns.regplot(x = movies['vote_count'], y = movies['revenue'], scatter_kws = {'s':2})
sns.despine(top = True, right = True)
#text(0.2, 1.0, 'Correlation: ' + str(np.corrcoef(movies['vote_count'], movies['revenue'])[0,1].round(4)), horizontalalignment='center', verticalalignment='center', transform = ax.transAxes)
#plt.savefig('corr.png', dpi = 1200)
#files.download('corr.png')

[figure: vote_count vs. revenue regression plot]

np.corrcoef(movies['vote_count'], movies['revenue'])[0, 1]
0.7514927348521296
plt.figure(figsize = (20, 10))
sns.countplot(x = 'release_year', data = movies, palette=("Blues_d"))
plt.xticks(rotation = 90, fontsize=8)
#plt.savefig('count_of_movies_each_year.png', dpi = 1200)
#files.download('count_of_movies_each_year.png')

[figure: number of movies released per year]

Movies released before 1995 will not be taken into consideration, since only a handful of movies from those years are present in the dataset.

pd.DataFrame(movies.groupby('release_year').sum()['budget'])
budget
release_year
1929 3.790000e+05
1933 4.390000e+05
1935 6.090000e+05
1936 1.500000e+06
1937 2.000000e+06
1939 6.777000e+06
1940 2.600000e+06
1946 1.010000e+07
1947 2.300000e+06
1948 3.700000e+06
1950 3.768785e+06
1952 4.000000e+06
1953 5.210000e+06
1954 9.100000e+05
1957 3.000000e+06
1959 2.883848e+06
1960 8.069470e+05
1961 6.000000e+06
1963 4.051500e+07
1964 2.600000e+07
1965 5.200000e+07
1968 1.200000e+07
1969 1.300000e+07
1970 5.150000e+07
1971 1.670000e+07
1972 6.010000e+06
1973 1.477700e+07
1974 3.206706e+07
1975 1.240000e+07
1976 9.960000e+06
... ...
1987 3.602500e+08
1988 4.192000e+08
1989 5.968600e+08
1990 8.792480e+08
1991 7.795000e+08
1992 8.481000e+08
1993 8.386000e+08
1994 1.499630e+09
1995 2.186925e+09
1996 2.977320e+09
1997 3.741000e+09
1998 4.006170e+09
1999 4.838312e+09
2000 5.359192e+09
2001 5.789840e+09
2002 6.093225e+09
2003 5.601130e+09
2004 6.379122e+09
2005 6.891215e+09
2006 6.323250e+09
2007 5.850065e+09
2008 6.860100e+09
2009 7.190425e+09
2010 7.858315e+09
2011 6.690365e+09
2012 7.433295e+09
2013 8.024550e+09
2014 7.423336e+09
2015 7.074595e+09
2016 4.572400e+09

70 rows × 1 columns

movies_after_1995 = movies[(movies['release_year'] >= 1995) & (movies['release_year'] != 2017)]
movies_numerical = movies_after_1995.select_dtypes(include = 'number').dropna()
pt = movies_after_1995.pivot_table(index = 'release_month', columns = 'release_year', values = 'revenue').reindex(index = months[1:])
fig, ax = plt.subplots(figsize=(15, 7)) 
ax.tick_params(axis="x", labelsize = 15)
ax.tick_params(axis="y", labelsize = 15)
sns.heatmap(pt)
#plt.savefig('heatmap.png', dpi = 1200)
#files.download('heatmap.png')
[figure: heatmap of average revenue by release month and year]

top_10_directors = list(movies.groupby('director_name').sum().sort_values(by = 'revenue', ascending = False).head(10).reset_index()['director_name'])
top_10_directors_revenue = list(movies.groupby('director_name').sum().sort_values(by = 'revenue', ascending = False).head(10).reset_index()['revenue'])
directors_and_revenue_dict = dict(zip(top_10_directors, top_10_directors_revenue))
movies_numerical['top_director'] = movies['director_name'].apply(lambda x: 1 if x in top_10_directors else 0)
movies.groupby('director_name').sum().sort_values(by = 'revenue', ascending = False).head(10).reset_index()[['director_name', 'revenue']]
director_name revenue
0 Steven Spielberg 4.114233e+09
1 Peter Jackson 2.286919e+09
2 Michael Bay 2.231243e+09
3 Tim Burton 2.071275e+09
4 Sam Raimi 2.049549e+09
5 James Cameron 1.948126e+09
6 Christopher Nolan 1.813228e+09
7 George Lucas 1.741418e+09
8 Joss Whedon 1.730887e+09
9 Robert Zemeckis 1.619309e+09
fig, ax = plt.subplots(figsize=(15, 7)) 
ax = sns.barplot(x = 'genre', y = 'revenue', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('Average Revenue per Genre')
plt.show()
#plt.savefig('avg_revenue_genre.png', format = 'png', dpi = 1200)
#files.download('avg_revenue_genre.png')
#movies = movies.drop('genre', axis = 1)

[figure: average revenue per genre]

movies_numerical['action_or_adventure'] = movies['genre'].apply(lambda x: 1 if x == 'Action' or x == 'Adventure' else 0)

Backward Elimination

Using a significance level (SL) of 0.05 (the conventional threshold), I will remove all features with a p-value greater than the SL, one at a time: in each iteration, the feature with the highest p-value is dropped.

The p-values are then recomputed and the process repeats until no remaining feature has a p-value above the SL. A sketch of this loop is shown below.
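A minimal sketch of that loop, assuming the same sm.OLS setup used in the cells below (backward_eliminate is an illustrative helper, not part of the original notebook):

def backward_eliminate(X, y, sl = 0.05):
    cols = list(X.columns)
    model = None
    while cols:
        model = sm.OLS(endog = y, exog = X[cols], hasconst = True).fit()
        worst = model.pvalues.idxmax()   # feature with the highest p-value
        if model.pvalues[worst] > sl:
            cols.remove(worst)           # drop it and refit
        else:
            break                        # every remaining p-value is below the SL
    return cols, model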

Also, the code below is needed because statsmodels doesn't add a column of ones (the intercept) by default.

X = movies_numerical.loc[:, movies_numerical.columns != 'revenue']
Y = movies_numerical['revenue']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 20)
model = LinearRegression(fit_intercept = True)
model.fit(X_train, Y_train)#, sample_weight = self.sample_weight)
intercept = model.intercept_
print(intercept)
813787443.4240371
movies_with_intercept = movies_numerical.copy()
#movies_with_ones = statsmodels.tools.tools.add_constant(movies_with_ones)
movies_with_intercept['intercept'] = intercept
#movies_with_ones['const'] = 1
X = movies_with_intercept.loc[:, movies_with_intercept.columns != 'revenue']
Y = movies_with_intercept['revenue']
insignificant_cols = ['facenumber_in_poster', 'num_critic_for_reviews', 'release_year']
X = X.drop(insignificant_cols, axis = 1)
regressor_OLS = sm.OLS(endog = Y, exog = X, hasconst = True).fit()
regressor_OLS.summary()
OLS Regression Results
Dep. Variable: revenue R-squared: 0.738
Model: OLS Adj. R-squared: 0.736
Method: Least Squares F-statistic: 509.3
Date: Fri, 22 May 2020 Prob (F-statistic): 0.00
Time: 23:53:35 Log-Likelihood: -54918.
No. Observations: 2912 AIC: 1.099e+05
Df Residuals: 2895 BIC: 1.100e+05
Df Model: 16
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
belongs_to_collection 2.26e+07 1.89e+06 11.944 0.000 1.89e+07 2.63e+07
vote_average 7.328e+06 1.89e+06 3.881 0.000 3.63e+06 1.1e+07
vote_count 3.181e+04 1451.774 21.909 0.000 2.9e+04 3.47e+04
duration -1.187e+05 4.36e+04 -2.720 0.007 -2.04e+05 -3.31e+04
director_facebook_likes -1418.4105 255.132 -5.560 0.000 -1918.670 -918.151
actor_3_facebook_likes -5949.3429 978.993 -6.077 0.000 -7868.936 -4029.749
actor_1_facebook_likes -5024.5994 592.564 -8.479 0.000 -6186.489 -3862.710
num_voted_users -37.6561 13.431 -2.804 0.005 -63.991 -11.322
cast_total_facebook_likes 4927.6744 591.545 8.330 0.000 3767.783 6087.566
num_user_for_reviews 1.152e+04 2797.639 4.119 0.000 6038.338 1.7e+04
budget 0.5461 0.023 23.411 0.000 0.500 0.592
actor_2_facebook_likes -4896.6107 626.430 -7.817 0.000 -6124.905 -3668.317
imdb_score -3.554e+06 1.6e+06 -2.223 0.026 -6.69e+06 -4.19e+05
movie_facebook_likes -514.4919 44.593 -11.537 0.000 -601.929 -427.054
top_director 3.654e+07 4.79e+06 7.628 0.000 2.72e+07 4.59e+07
action_or_adventure -1.05e+07 1.77e+06 -5.936 0.000 -1.4e+07 -7.03e+06
intercept -0.0096 0.008 -1.210 0.226 -0.025 0.006
Omnibus: 1079.965 Durbin-Watson: 1.937
Prob(Omnibus): 0.000 Jarque-Bera (JB): 11922.721
Skew: 1.437 Prob(JB): 0.00
Kurtosis: 12.487 Cond. No. 5.60e+09


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.6e+09. This might indicate that there are
strong multicollinearity or other numerical problems.

Omnibus/Prob(Omnibus) – a joint test of the skewness and kurtosis of the residuals. Ideally Omnibus is close to zero and Prob(Omnibus) is close to 1, which would indicate normally distributed residuals. Here Omnibus is very large (about 1080) and Prob(Omnibus) is 0.000, so the residuals are clearly not normal; a linear fit still beats random guessing, but a nonlinear approach or a transformed target might do better.

Skew – a measure of the symmetry of the residuals; a value close to zero indicates a roughly normal distribution. At 1.44, the residuals are noticeably right-skewed.

Kurtosis – a measure of "peakiness" and tail weight of the residuals; a normal distribution has a kurtosis of 3. At 12.5, the residuals have heavy tails, i.e. there are sizeable outliers.

Durbin-Watson – a test for autocorrelation in the residuals; values close to 2 indicate little autocorrelation. At 1.94, this looks fine.

Jarque-Bera (JB)/Prob(JB) – like the Omnibus test, it checks skew and kurtosis jointly, and here it confirms the Omnibus result: the residuals are not normally distributed.

Condition Number – measures how sensitive the model's output is to small changes in its input; values above roughly 30 suggest multicollinearity. At 5.6e+09 it is enormous, which matches the multicollinearity warning printed above (the various Facebook-likes features, for instance, are closely related to one another).
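The notebook imports het_white, statsmodels.stats.api (as sms) and lzip but never calls them; a minimal sketch of how those heteroscedasticity and normality diagnostics could be run on the fitted model above (assuming regressor_OLS and X from the previous cell):

# White test for heteroscedasticity of the OLS residuals
lm_stat, lm_p, f_stat, f_p = het_white(regressor_OLS.resid, X)
print(lzip(['LM stat', 'LM p-value', 'F stat', 'F p-value'], [lm_stat, lm_p, f_stat, f_p]))

# Jarque-Bera normality test of the residuals (matches the JB row in the summary above)
jb_stat, jb_p, skew, kurtosis = sms.jarque_bera(regressor_OLS.resid)
print(lzip(['JB', 'p-value', 'skew', 'kurtosis'], [jb_stat, jb_p, skew, kurtosis]))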

movies_be = movies_numerical.copy().drop(insignificant_cols, axis = 1)
plt.figure(figsize = (18, 13))
sns.set_style("white")

corr_df = movies_be.copy()
corr = corr_df.corr().round(2)

mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

#sns.heatmap(corr, annot = True, cmap = 'viridis', mask = mask)
heatmap = sns.heatmap(corr, annot = True, cmap = 'viridis', mask = mask)
bottom, top = heatmap.get_ylim()
heatmap.set_ylim(bottom + 0.6, top - 0.5)
fig = heatmap.get_figure()
#fig.savefig('heatmap.png', format = 'png', dpi = 1200)
plt.show()

[figure: correlation heatmap of the selected features]

Feature Selection Stuff

Select k best / Univariate Selection

from sklearn.feature_selection import SelectKBest, chi2

X = movies_be.copy().loc[:, movies_be.columns != 'revenue']
y = movies_be.copy()['revenue']

bestfeatures = SelectKBest(score_func=chi2, k=9)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(9,'Score')) 

k_best_cols = list(featureScores.nlargest(9,'Score').iloc[:, 0])
k_best_cols.append('revenue')
                        Specs         Score
10                     budget  1.385553e+11
7             num_voted_users  5.994871e+08
13       movie_facebook_likes  1.483301e+08
8   cast_total_facebook_likes  9.281176e+07
6      actor_1_facebook_likes  9.042603e+07
4     director_facebook_likes  3.489936e+07
11     actor_2_facebook_likes  3.058759e+07
5      actor_3_facebook_likes  1.452063e+07
2                  vote_count  6.464257e+06

Feature Importance

from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

model = ExtraTreesClassifier()
model.fit(X,y)

print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

feature_important_cols = list(feat_importances.nlargest(10).keys())
feature_important_cols.append('revenue')
[0.01470752 0.07326097 0.07651357 0.07766461 0.07487516 0.07774061
 0.07112115 0.07791485 0.07454699 0.07836348 0.07573893 0.0770569
 0.075152   0.0598933  0.00202457 0.0134254 ]

[figure: top 10 feature importances from ExtraTreesClassifier]

Machine Learning

Note to self: Click here for working Predictor class dated 4/8/2019

Linear Regression. Note to self: Using only features with a correlation coefficient between 0.1 and 0.9 yielded no change in accuracy compared to using the whole dataset.

Note to self: Box-Cox and log transforming the dependent variable decreased accuracy or left it unchanged. (A sketch of those transforms is shown below.)
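For reference, a sketch of the target transforms that note refers to, using np.log1p and the boxcox1p import from the top of the notebook (the lambda value here is an arbitrary illustration, not necessarily what was tried):

y_log = np.log1p(movies_be['revenue'])        # log(1 + revenue)
y_bc = boxcox1p(movies_be['revenue'], 0.15)   # Box-Cox with an illustrative lambda of 0.15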

#results_df.drop(results_df.index, inplace=True)

Linear Regression

Normalization is not needed but is done for easy interpretation of the error metrics. Standardization is also not required here.
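The per-column min-max loop used in the cells below is equivalent to sklearn's MinMaxScaler; a sketch of that equivalence, applied to the same dataset frame the cells construct, for reference only:

from sklearn.preprocessing import MinMaxScaler
# Scales every column to [0, 1], matching the manual (x - min) / (max - min) loop below.
dataset = pd.DataFrame(MinMaxScaler().fit_transform(dataset), columns = dataset.columns, index = dataset.index)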

from sklearn.preprocessing import PowerTransformer

dataset = movies_be.copy().drop(['action_or_adventure', 'top_director'], axis = 1)

cv_sets = 10
corrs = []
iterations = 100
normalize = True
results_list = ['Linear Regression']

if normalize:
    for column in dataset.columns:
      dataset[column] = ((dataset[column]) - dataset[column].min()) / (dataset[column].max() - dataset[column].min())

X = dataset.loc[:, dataset.columns != 'revenue']
y = dataset['revenue']

model = LinearRegression(fit_intercept = True)
model.fit(X, y)

mse_mean = cross_val_score(model, X, y, cv = cv_sets, scoring = 'neg_mean_squared_error').mean()
mae_mean = cross_val_score(model, X, y, cv = cv_sets, scoring = 'neg_mean_absolute_error').mean()
r2_mean = cross_val_score(model, X, y, cv = cv_sets, scoring = 'r2').mean()

# Calculate correlation between predicted and actual for a number of iterations and get the average

for i in tqdm_notebook(range(iterations), total = iterations, unit = 'iteration'):
  X_train, X_test, y_train, y_test = train_test_split(X, y)
  model = LinearRegression(fit_intercept = True)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  corrs.append(np.corrcoef(y_test, y_pred)[0, 1])

results_list.append(sum(corrs)/iterations)
results_list.append(abs(mae_mean))
results_list.append(abs(mse_mean))
results_list.append(sqrt(abs(mse_mean)))
results_list.append(abs(r2_mean))

results_df = pd.DataFrame([results_list], columns = ['Algorithm', 'Correlation', 'MAE', 'MSE', 'RMSE', 'R2'])
results_df
Algorithm Correlation MAE MSE RMSE R2
0 Linear Regression 0.848596 0.032815 0.002603 0.051022 0.70225

Support Vector Regression

Note: When using standardization, the

from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

corrs = []
mses = []
maes = []
r2s = []
rmses = []
results_list = ['Support Vector']

for i in range(100):
  dataset = movies_be.copy()
  cv_sets = 10
  normalize = True
  standardize = False


  if normalize:
      for column in dataset.columns:
        dataset[column] = ((dataset[column]) - dataset[column].min()) / (dataset[column].max() - dataset[column].min())

  X = dataset.loc[:, dataset.columns != 'revenue']
  y = dataset['revenue']

  if standardize:
    sc_X = StandardScaler()
    sc_y = StandardScaler()
    X = sc_X.fit_transform(X)
    y = sc_y.fit_transform(y.values.reshape(-1, 1))

  X_train, X_test, y_train, y_test = train_test_split(X, y) #, test_size = 0.2)

  model = SVR(kernel = 'linear')
  model.fit(X_train, y_train)

  if standardize:
    y_pred = sc_y.inverse_transform(model.predict(sc_X.transform(X_test)))
  else:
    y_pred = model.predict(X_test)
  
  
  mses.append(mean_squared_error(y_test, y_pred)) 
  maes.append(mean_absolute_error(y_test, y_pred))
  r2s.append(r2_score(y_test, y_pred))
  corrs.append(np.corrcoef(y_test, y_pred)[0, 1])
  rmses.append(sqrt(abs(mean_squared_error(y_test, y_pred))))


results_list.append(np.mean(corrs))
results_list.append(abs(np.mean(maes)))
results_list.append(abs(np.mean(mses)))
results_list.append(np.mean(rmses))
results_list.append(abs(np.mean(r2s)))

results_df.loc[len(results_df)] = results_list

Decision Tree Regression

from sklearn.tree import DecisionTreeRegressor

dataset = movies_be.copy()
cv_sets = 10
corrs = []
results_list = ['Decision Tree']
iterations = 100
normalize = True

if normalize:
    for column in dataset.columns:
      dataset[column] = ((dataset[column]) - dataset[column].min()) / (dataset[column].max() - dataset[column].min())

X = dataset.loc[:, dataset.columns != 'revenue']
y = dataset['revenue']

model = DecisionTreeRegressor()
model.fit(X, y)

mse_mean = cross_val_score(model, X, y, cv = cv_sets, scoring = 'neg_mean_squared_error').mean()
mae_mean = cross_val_score(model, X, y, cv = cv_sets, scoring = 'neg_mean_absolute_error').mean()
r2_mean = cross_val_score(model, X, y, cv = cv_sets, scoring = 'r2').mean()

for i in tqdm_notebook(range(iterations), total = iterations, unit = 'iteration'):
  X_train, X_test, y_train, y_test = train_test_split(X, y)
  model = DecisionTreeRegressor()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  corrs.append(np.corrcoef(y_test, y_pred)[0, 1])

results_list.append(np.mean(corrs))
results_list.append(abs(mae_mean))
results_list.append(abs(mse_mean))
results_list.append(sqrt(abs(mse_mean)))
results_list.append(abs(r2_mean))

results_df.loc[len(results_df)] = results_list

Random Forest

from sklearn.ensemble import RandomForestRegressor

dataset = movies_be.copy()
cv_sets = 10
corrs = []
results_list = ['Random Forest']
iterations = 1
normalize = True
estimators = 100

if normalize:
    for column in dataset.columns:
      dataset[column] = ((dataset[column]) - dataset[column].min()) / (dataset[column].max() - dataset[column].min())

X = dataset.loc[:, dataset.columns != 'revenue']
y = dataset['revenue']

model = RandomForestRegressor(n_estimators = estimators)
model.fit(X, y)

mse_mean = cross_val_score(model, X, y, cv = cv_sets, scoring = 'neg_mean_squared_error').mean().round(4)
mae_mean = cross_val_score(model, X, y, cv = cv_sets, scoring = 'neg_mean_absolute_error').mean().round(4)
r2_mean = cross_val_score(model, X, y, cv = cv_sets, scoring = 'r2').mean().round(4)

for i in tqdm_notebook(range(iterations), total = iterations, unit = 'iteration'):
  X_train, X_test, y_train, y_test = train_test_split(X, y)
  model = RandomForestRegressor(n_estimators = estimators)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  corrs.append(np.corrcoef(y_test, y_pred)[0, 1])

print('Corr: \t' + str((sum(corrs)/iterations).round(4)))
print('MAE: \t'+ str(abs(mae_mean).round(4)))
print('MSE: \t' + str(abs(mse_mean).round(4))) 
print('RMSE: \t' + str(sqrt(abs(mse_mean)).round(4)))
print('R2: \t' + str(abs(r2_mean).round(4)))

#print(sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), dataset.columns), reverse=True))

results_list.append(sum(corrs)/iterations)
results_list.append(abs(mae_mean))
results_list.append(abs(mse_mean))
results_list.append(sqrt(abs(mse_mean)))
results_list.append(abs(r2_mean))
results_df.loc[len(results_df)] = results_list



Corr: 	0.8688
MAE: 	0.0317
MSE: 	0.0025
RMSE: 	0.05
R2: 	0.7128
feat_importances = pd.Series(model.feature_importances_, index = X.columns)
fig = feat_importances.nsmallest(len(X.columns)).plot(kind='barh').get_figure()
plt.tight_layout()
#fig.savefig('feat_importances.png', format='png', dpi=1200)
#files.download("feat_importances.png") 

[figure: random forest feature importances]

Ridge Regression

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

dataset = movies_be.copy()

cv_sets = 10
normalize = False
results_list = ['Ridge']

if normalize:
    for column in dataset.columns:
      dataset[column] = ((dataset[column]) - dataset[column].min()) / (dataset[column].max() - dataset[column].min())

X = dataset.loc[:, dataset.columns != 'revenue']
y = dataset['revenue']
parameters = {'alpha' : [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
scoring = {'MAE': 'neg_mean_absolute_error', 'MSE': 'neg_mean_squared_error', 'R2' : 'r2'}

model = Ridge()

regressor_mse = GridSearchCV(model, parameters, scoring = 'neg_mean_squared_error', cv = cv_sets)
regressor_mse.fit(X, y)

regressor_mae = GridSearchCV(model, parameters, scoring = 'neg_mean_absolute_error', cv = cv_sets)
regressor_mae.fit(X, y)

regressor_r2 = GridSearchCV(model, parameters, scoring = 'r2', cv = cv_sets)
regressor_r2.fit(X, y)

results_list.append(np.nan)
results_list.append(abs(regressor_mae.best_score_))   # cross-validated scores from the grid searches above
results_list.append(abs(regressor_mse.best_score_))
results_list.append(sqrt(abs(regressor_mse.best_score_)))
results_list.append(abs(regressor_r2.best_score_))
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

dataset = movies_be.copy()
iterations = 100
corrs = []

cv_sets = 10
normalize = True

if normalize:
    for column in dataset.columns:
      dataset[column] = ((dataset[column]) - dataset[column].min()) / (dataset[column].max() - dataset[column].min())

X = dataset.loc[:, dataset.columns != 'revenue']
y = dataset['revenue']
parameters = {'alpha' : [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
scoring_metrics = {'MAE': 'neg_mean_absolute_error', 'MSE': 'neg_mean_squared_error', 'R2' : 'r2'}

for i in tqdm_notebook(range(iterations), total = iterations, unit = 'iteration'):
  X_train, X_test, y_train, y_test = train_test_split(X, y)
  model = Ridge()
  regressor = GridSearchCV(model, parameters, cv = cv_sets)
  regressor.fit(X_train, y_train)
  y_pred = regressor.predict(X_test)
  corrs.append(np.corrcoef(y_test, y_pred)[0, 1])

results_list[1] = sum(corrs)/iterations
results_df.loc[len(results_df)] = results_list

Lasso Regression

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

dataset = movies_be.copy()

cv_sets = 10
normalize = True
results_list = ['Lasso']

if normalize:
    for column in dataset.columns:
      dataset[column] = ((dataset[column]) - dataset[column].min()) / (dataset[column].max() - dataset[column].min())

X = dataset.loc[:, dataset.columns != 'revenue']
y = dataset['revenue']
parameters = {'alpha' : [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}

#model = Lasso(tol=1)
model = Lasso()

regressor_mse = GridSearchCV(model, parameters, scoring = 'neg_mean_squared_error', cv = cv_sets)
regressor_mse.fit(X, y)

regressor_mae = GridSearchCV(model, parameters, scoring = 'neg_mean_absolute_error', cv = cv_sets)
regressor_mae.fit(X, y)

regressor_r2 = GridSearchCV(model, parameters, scoring = 'r2', cv = cv_sets)
regressor_r2.fit(X, y)

results_list.append(np.nan)
results_list.append(abs(regressor_mae.best_score_))   # cross-validated scores from the grid searches above
results_list.append(abs(regressor_mse.best_score_))
results_list.append(sqrt(abs(regressor_mse.best_score_)))
results_list.append(abs(regressor_r2.best_score_))

To calculate the correlation. (Note to self: don't run this again; the results are almost always the same.)

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

dataset = movies_be.copy()
iterations = 100
corrs = []

cv_sets = 10
normalize = True

if normalize:
    for column in dataset.columns:
      dataset[column] = ((dataset[column]) - dataset[column].min()) / (dataset[column].max() - dataset[column].min())

X = dataset.loc[:, dataset.columns != 'revenue']
y = dataset['revenue']
parameters = {'alpha' : [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
scoring_metrics = {'MAE': 'neg_mean_absolute_error', 'MSE': 'neg_mean_squared_error', 'R2' : 'r2'}

for i in tqdm_notebook(range(iterations), total = iterations, unit = 'iteration'):
  X_train, X_test, y_train, y_test = train_test_split(X, y)
  model = Lasso()
  regressor = GridSearchCV(model, parameters, cv = cv_sets)
  regressor.fit(X_train, y_train)
  y_pred = regressor.predict(X_test)
  corrs.append(np.corrcoef(y_test, y_pred)[0, 1])

results_list[1] = sum(corrs)/iterations
results_df.loc[len(results_df)] = results_list
results_df
Algorithm Correlation MAE MSE RMSE R2
0 Linear Regression 0.848596 0.032815 0.002603 0.051022 0.702250
1 Support Vector 0.840365 0.043034 0.003219 0.238074 0.650731
2 Decision Tree 0.752134 0.043034 0.003219 0.238074 0.650731
3 Random Forest 0.868772 0.031700 0.002500 0.050000 0.712800
4 Ridge 0.854340 0.043034 0.003219 0.238074 0.650731
5 Lasso 0.851810 0.043034 0.003219 0.238074 0.650731
pos = list(range(len(results_df['Correlation']))) 
width = 0.25 
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(10,5))

rects1 = plt.bar(pos, results_df['Correlation'], width, color = '#ff0000', label = results_df['Algorithm'][0])
rects2 = plt.bar([p + width for p in pos], results_df['R2'], width, color = '#32a852', label = results_df['Algorithm'][1])

#ax.set_ylabel('Score')
#ax.set_title('Correlation and R-squared comparison')
ax.set_xticks([p + 1.5 * width for p in pos])
ax.set_xticklabels(results_df['Algorithm'])

plt.xlim(min(pos)-width, max(pos)+width*4)
ax.spines['bottom'].set_color('black')
ax.spines['left'].set_color('black')
plt.ylim([0.6, 1])
#plt.xticks(rotation = 45)

def autolabel(rects):
    for rect in rects:
        height = rect.get_height().round(3)
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')
autolabel(rects1)
autolabel(rects2)
sns.despine(top = True, right = True)
plt.legend(['Correlation', 'R-squared'], loc='upper left')
plt.show()

#fig.savefig('corrandr2.png', format = 'png', dpi = 1200)
#files.download('corrandr2.png')

[figure: correlation and R-squared by algorithm]

pos = list(range(len(results_df['MAE']))) 
width = 0.25 
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(10,5))

rects1 = plt.bar(pos, results_df['MAE'], width, color = '#ffa500', label = results_df['Algorithm'][0])
rects2 = plt.bar([p + width for p in pos], results_df['RMSE'], width, color = '#34a1eb', label = results_df['Algorithm'][1])
#ax.set_ylabel('Score')
#ax.set_title('Error metrics comparison')
ax.set_xticks([p + 1.5 * width for p in pos])
ax.set_xticklabels(results_df['Algorithm'])

plt.xlim(min(pos)-width, max(pos)+width*4)
plt.ylim([0, max(results_df['MAE'] + results_df['RMSE'])] )
#plt.xticks(rotation = 45)
ax.spines['bottom'].set_color('black')
ax.spines['left'].set_color('black')

def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height().round(3)
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')
autolabel(rects1)
autolabel(rects2)
sns.despine(top = True, right = True)
plt.legend(['MAE', 'RMSE'], loc='upper left')
plt.show()

[figure: MAE and RMSE by algorithm]
