
charity-ml's Introduction

Import libraries necessary for this project

import numpy as np import pandas as pd from time import time from IPython.display import display # Allows the use of display() for DataFrames

Import supplementary visualization code visuals.py

import visuals as vs

Pretty display for notebooks

%matplotlib inline

Load the Census dataset

data = pd.read_csv("census.csv")

Success - Display the first record

display(data.head(n=1))
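
Before computing the summary statistics below, it helps to confirm what the raw income labels look like. This quick check is an addition, not part of the original notebook; it assumes the standard census labels '<=50K' and '>50K':

print(data['income'].value_counts())  # expect two labels: '<=50K' and '>50K'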

TODO: Total number of records

n_records = len(data)

TODO: Number of records where individual's income is more than $50,000

n_greater_50k = (data['income'] == '>50K').sum()

TODO: Number of records where individual's income is at most $50,000

n_at_most_50k = n_records-n_greater_50k

TODO: Percentage of individuals whose income is more than $50,000

greater_percent = n_greater_50k * 100 / n_records

Print the results

print("Total number of records: {}".format(n_records)) print("Individuals making more than $50,000: {}".format(n_greater_50k)) print("Individuals making at most $50,000: {}".format(n_at_most_50k)) print("Percentage of individuals making more than $50,000: {}%".format(greater_percent))

Split the data into features and target label

income_raw = data['income']
features_raw = data.drop('income', axis=1)

Visualize skewed continuous features of original data

vs.distribution(data)

Log-transform the skewed features

skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data=features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))
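
The +1 offset keeps zero-valued records defined, since log(0) is undefined; numpy's log1p computes the same quantity with better numerical behavior near zero. A minimal check of the equivalence (the values here are illustrative, not dataset rows):

x = np.array([0.0, 10.0, 99999.0])              # typical capital-gain magnitudes
assert np.allclose(np.log(x + 1), np.log1p(x))  # log(x + 1) == log1p(x)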

Visualize the new log distributions

vs.distribution(features_log_transformed, transformed = True)

Import sklearn.preprocessing.MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

Initialize a scaler, then apply it to the features

scaler = MinMaxScaler()  # default=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

features_log_minmax_transform = pd.DataFrame(data=features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])
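
As a quick sanity check (an addition, not from the original notebook), the scaled numerical columns should now span exactly the scaler's default (0, 1) range, since MinMaxScaler maps each column's minimum to 0 and maximum to 1:

print(features_log_minmax_transform[numerical].min().min())  # expect 0.0
print(features_log_minmax_transform[numerical].max().max())  # expect 1.0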

Show an example of a record with scaling applied

display(features_log_minmax_transform.head(n = 5))

TODO: One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()

features_final = pd.get_dummies(features_log_minmax_transform)
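
To see what get_dummies does, here is a toy sketch on a single categorical column ('workclass' is one of the census features; the rows are made up for illustration):

toy = pd.DataFrame({'workclass': ['Private', 'State-gov', 'Private']})
print(pd.get_dummies(toy))
# Produces indicator columns 'workclass_Private' and 'workclass_State-gov',
# one per distinct category value.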

TODO: Encode the 'income_raw' data to numerical values

income = income_raw.apply(lambda x: 1 if x == '>50K' else 0)
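
A quick check, added here for illustration: the encoded labels should be 0/1, with counts matching n_at_most_50k and n_greater_50k computed earlier:

print(income.value_counts())  # counts for 0 and 1 should match the earlier totals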

Print the number of features after one-hot encoding

encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

Uncomment the following line to see the encoded feature names

# print(encoded)

Import train_test_split

from sklearn.model_selection import train_test_split

Split the 'features' and 'income' data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(features_final, income, test_size = 0.2, random_state = 0)

Show the results of the split

print("Training set has {} samples.".format(X_train.shape[0])) print("Testing set has {} samples.".format(X_test.shape[0])) ''' TP = np.sum(income) # Counting the ones as this is the naive case. Note that 'income' is the 'income_raw' data encoded to numerical values done in the data preprocessing step. FP = income.count() - TP # Specific to the naive case TN = 0 # No predicted negatives in the naive case FN = 0 # No predicted negatives in the naive case '''

TODO: Calculate accuracy, precision and recall

accuracy = n_greater_50k / n_records
precision = n_greater_50k / (n_greater_50k + n_at_most_50k)  # TP / (TP + FP); equals accuracy here
recall = n_greater_50k / (n_greater_50k + 0)                 # TP / (TP + FN) = 1, since FN = 0

TODO: Calculate F-score using the formula above for beta = 0.5 and correct values for precision and recall.

fscore = (1 + 0.5**2) * (precision * recall) / ((0.5**2 * precision) + recall)
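
For reference, the formula being applied is F_beta = (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall); with beta = 0.5 precision is weighted more heavily than recall. A standalone numeric check with illustrative values (not the dataset's):

precision_demo, recall_demo, beta = 0.25, 1.0, 0.5
fbeta_demo = (1 + beta**2) * precision_demo * recall_demo / (beta**2 * precision_demo + recall_demo)
print(round(fbeta_demo, 4))  # 0.2941 for these illustrative inputs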

Print the results

print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

TODO: Import two metrics from sklearn - fbeta_score and accuracy_score

from sklearn.metrics import fbeta_score, accuracy_score

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''

    results = {}

    # Fit the learner to the training data, slicing with 'sample_size'
    start = time()  # Get start time
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()  # Get end time

    # Calculate the training time
    results['train_time'] = end - start

    # Get predictions on the test set (X_test),
    # then on the first 300 training samples (X_train), using .predict()
    start = time()  # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time()  # Get end time

    # Calculate the total prediction time
    results['pred_time'] = end - start

    # Compute accuracy on the first 300 training samples, y_train[:300]
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)

    # Compute accuracy on the test set using accuracy_score()
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # Compute F-score on the first 300 training samples using fbeta_score()
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5)

    # Compute F-score on the test set, y_test
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results
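
A minimal standalone call of train_predict, using GaussianNB purely as an illustration (any sklearn classifier with fit/predict would do):

from sklearn.naive_bayes import GaussianNB
demo_results = train_predict(GaussianNB(), len(y_train), X_train, y_train, X_test, y_test)
print(demo_results['acc_test'], demo_results['f_test'])  # test-set accuracy and F-0.5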
TODO: Import the three supervised learning models from sklearn

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

TODO: Initialize the three models

clf_A = DecisionTreeClassifier()
clf_B = SVC()
clf_C = AdaBoostClassifier()

TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data

HINT: samples_100 is the entire training set i.e. len(y_train)

HINT: samples_10 is 10% of samples_100 (make sure the count is an int, not a float)

HINT: samples_1 is 1% of samples_100 (make sure the count is an int, not a float)

samples_100 = len(y_train)
samples_10 = int(round(len(y_train) / 10))
samples_1 = int(round(len(y_train) / 100))

Collect results on the learners

results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
            train_predict(clf, samples, X_train, y_train, X_test, y_test)

Run metrics visualization for the three supervised learning models chosen

vs.evaluate(results, accuracy, fscore)

TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

TODO: Initialize the classifier

clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())

TODO: Create the parameters list you wish to tune, using a dictionary if needed.

HINT: parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}

parameters = {'n_estimators': [50, 120],
              'learning_rate': [0.1, 0.5, 1.],
              'base_estimator__min_samples_split': np.arange(2, 8, 2),
              'base_estimator__max_depth': np.arange(1, 4, 1)}
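
This grid describes 2 × 3 × 3 × 3 = 54 candidate parameter combinations, each of which is fitted once per cross-validation fold. A quick way to confirm the count (this check is an addition, using sklearn's ParameterGrid utility):

from sklearn.model_selection import ParameterGrid
print(len(ParameterGrid(parameters)))  # 54 combinations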

TODO: Make an fbeta_score scoring object using make_scorer()

scorer = make_scorer(fbeta_score, beta=0.5)

TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()

grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

TODO: Fit the grid search object to the training data and find the optimal parameters using fit()

grid_fit = grid_obj.fit(X_train,y_train)

Get the estimator

best_clf = grid_fit.best_estimator_
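
Beyond the estimator itself, the fitted grid-search object records which combination won and its cross-validated score:

print(grid_fit.best_params_)  # the winning parameter combination
print(grid_fit.best_score_)   # mean cross-validated F-0.5 score for that combination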

Make predictions using the unoptimized and optimized models

predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

Report the before-and-after scores

print("Unoptimized model\n------") print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions))) print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5))) print("\nOptimized Model\n------") print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))) print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))

TODO: Import a supervised learning model that has 'feature_importances_'

TODO: Train the supervised model on the training set using .fit(X_train, y_train)

model = AdaBoostClassifier().fit(X_train,y_train)

TODO: Extract the feature importances using .feature_importances_

importances = model.feature_importances_

Plot

vs.feature_plot(importances, X_train, y_train)

Import functionality for cloning a model

from sklearn.base import clone

Reduce the feature space

X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]]
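
The indexing above works because np.argsort(importances)[::-1] lists feature positions from most to least important, so the first five positions select the top-5 columns. A tiny illustration with made-up importances:

demo = np.array([0.1, 0.4, 0.05, 0.3])
print(np.argsort(demo)[::-1][:2])  # [1 3] -- positions of the two largest values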

Train on the "best" model found from grid search earlier

clf = (clone(best_clf)).fit(X_train_reduced, y_train)

Make new predictions

reduced_predictions = clf.predict(X_test_reduced)

Report scores from the final model using both versions of data

print("Final Model trained on full data\n------") print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))) print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5))) print("\nFinal Model trained on reduced data\n------") print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions))) print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta = 0.5)))
