Use cross_validation to split train/test data
See the following link:
http://scikit-learn.org/0.17/modules/cross_validation.html
Evaluation Metrics
Classification Metrics
accuracy = number of correctly identified instances / all instances
In sklearn, this is the classifier's .score() method (equivalently, sklearn.metrics.accuracy_score).
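A minimal sketch (toy data, hypothetical numbers) showing that a classifier's .score() and accuracy_score return the same value:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
X_toy = [[0], [1], [2], [3]]
y_toy = [0, 0, 1, 1]
clf = GaussianNB().fit(X_toy, y_toy)
# .score() on a classifier reports accuracy; accuracy_score gives the same number
print clf.score(X_toy, y_toy)
print accuracy_score(y_toy, clf.predict(X_toy))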
#
# In this and the following exercises, you'll be adding train/test splits to the data
# to see how they change the performance of each classifier
#
# The code provided will load the Titanic dataset like you did in project 0, then train
# a decision tree (the method you used in your project) and a Bayesian classifier (as
# discussed in the introduction videos). You don't need to worry about how these work for
# now.
#
# What you do need to do is import a train/test split, train the classifiers on the
# training data, and store the resulting accuracy scores in the dictionary provided.
import numpy as np
import pandas as pd
# Load the dataset
X = pd.read_csv('titanic_data.csv')
# Limit to numeric data
X = X._get_numeric_data()
# Separate the labels
y = X['Survived']
# Remove labels from the inputs, and age due to missing data
del X['Age'], X['Survived']
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
# The decision tree classifier
clf1 = DecisionTreeClassifier()
clf1.fit(X_train,y_train)
print "Decision Tree has accuracy: ",accuracy_score(y_test, clf1.predict(X_test))
# The naive Bayes classifier
clf2 = GaussianNB()
clf2.fit(X_train,y_train)
print "GaussianNB has accuracy: ",accuracy_score(y_test, clf2.predict(X_test))
answer = {
"Naive Bayes Score": 0.638655462185,
"Decision Tree Score": 0.672268907563
}
Problems with using accuracy:
- not ideal for skewed classes (see the sketch below)
- may want to err on the side of guessing innocent
- may want to err on the side of guessing guilty
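A minimal sketch of the skewed-class problem (toy data): a classifier that always guesses the majority class still gets high accuracy while never identifying a positive instance.
import numpy as np
from sklearn.metrics import accuracy_score
# 90 negative instances and 10 positive ones: heavily skewed labels
y_true = np.array([0] * 90 + [1] * 10)
y_pred = np.zeros(100, dtype=int)  # always guess the majority class
print accuracy_score(y_true, y_pred)  # 0.9, yet recall on the positive class is 0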
Confusion Matrix
predicted class \ actual class | positive | negative
---|---|---
positive | true positives | false positives
negative | false negatives | true negatives
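Note that sklearn's confusion_matrix is oriented the other way around: rows are actual classes and columns are predicted classes. A toy example:
from sklearn.metrics import confusion_matrix
y_true = [1, 1, 1, 0, 0, 0]
y_pred = [1, 1, 0, 0, 0, 1]
# rows = actual, columns = predicted:
# [[2 1]    2 true negatives, 1 false positive
#  [1 2]]   1 false negative, 2 true positives
print confusion_matrix(y_true, y_pred)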
# In this exercise, we'll use the Titanic dataset as before, train two classifiers and
# look at their confusion matrices. Your job is to create a train/test split in the data
# and report the results in the dictionary at the bottom.
import numpy as np
import pandas as pd
# Load the dataset
X = pd.read_csv('titanic_data.csv')
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
# TODO: split the data into training and testing sets,
# using the default settings for train_test_split (or test_size = 0.25 if specified).
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=0)
clf1 = DecisionTreeClassifier()
clf1.fit(X_train,y_train)
print "Confusion matrix for this Decision Tree:\n",confusion_matrix(y_test,clf1.predict(X_test))
clf2 = GaussianNB()
clf2.fit(X_train,y_train)
print "GaussianNB confusion matrix:\n",confusion_matrix(y_test,clf2.predict(X_test))
#TODO: store the confusion matrices on the test sets below
confusions = {
"Naive Bayes": [[122 17]
[ 48 36]],
"Decision Tree": [[100 39]
[ 39 45]]
}
Precision and Recall
recall = True Positives / (True Positives + False Negatives)
precision = True Positives / (True Positives + False Positives)
For "Gerhard Schroeder":
True Positives = 14
False Positives = 1
False Negatives = 12
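Plugging these counts into the formulas: precision = 14 / (14 + 1) ≈ 0.93, and recall = 14 / (14 + 12) ≈ 0.54.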
# As with the previous exercises, let's look at the performance of a couple of classifiers
# on the familiar Titanic dataset. Add a train/test split, then store the results in the
# dictionary provided.
import numpy as np
import pandas as pd
# Load the dataset
X = pd.read_csv('titanic_data.csv')
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=0)
clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall(y_test,clf1.predict(X_test)),precision(y_test,clf1.predict(X_test)))
clf2 = GaussianNB()
clf2.fit(X_train, y_train)
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall(y_test,clf2.predict(X_test)),precision(y_test,clf2.predict(X_test)))
results = {
"Naive Bayes Recall": 0.43,
"Naive Bayes Precision": 0.68,
"Decision Tree Recall": 0.56,
"Decision Tree Precision": 0.57
}
F1 Score
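F1 is the harmonic mean of precision and recall: F1 = 2 * (precision * recall) / (precision + recall). It is high only when both precision and recall are high.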
# As usual, use a train/test split to get a reliable F1 score from two classifiers, and
# save the scores in the provided dictionary.
import numpy as np
import pandas as pd
# Load the dataset
X = pd.read_csv('titanic_data.csv')
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=0)
clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
print "Decision Tree F1 score: {:.2f}".format(f1_score(y_test, clf1.predict(X_test)))
clf2 = GaussianNB()
clf2.fit(X_train, y_train)
print "GaussianNB F1 score: {:.2f}".format(f1_score(y_test, clf2.predict(X_test)))
F1_scores = {
"Naive Bayes": 0.54,
"Decision Tree": 0.53
}
Regression Metrics
Mean Absolute Error
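mean absolute error = sum of |predicted - actual| over all points / number of points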
import numpy as np
import pandas as pd
# Load the dataset
from sklearn.datasets import load_linnerud
linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the regressors with your newly split data instead of X and y.
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=0)
reg1 = DecisionTreeRegressor()
reg1.fit(X_train, y_train)
print "Decision Tree mean absolute error: {:.2f}".format(mae(y_test,reg1.predict(X_test)))
reg2 = LinearRegression()
reg2.fit(X_train, y_train)
print "Linear regression mean absolute error: {:.2f}".format(mae(y_test,reg2.predict(X_test)))
results = {
"Linear Regression": 11.45,
"Decision Tree": 17.27
}
Mean Squared Error
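mean squared error = sum of (predicted - actual)^2 over all points / number of points. Squaring penalizes large errors more heavily than MAE does.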
import numpy as np
import pandas as pd
# Load the dataset
from sklearn.datasets import load_linnerud
linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the regressors with your newly split data instead of X and y.
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=0)
reg1 = DecisionTreeRegressor()
reg1.fit(X_train, y_train)
print "Decision Tree mean absolute error: {:.2f}".format(mse(y_test, reg1.predict(X_test)))
reg2 = LinearRegression()
reg2.fit(X_train, y_train)
print "Linear regression mean absolute error: {:.2f}".format(mse(y_test, reg2.predict(X_test)))
results = {
"Linear Regression": 380.46,
"Decision Tree": 1087.27
}
Regression Scoring Functions
These functions score on a scale where 0 is bad and 1 is perfect. One of them is the R2 score; the other is the explained variance score, both in sklearn.metrics.
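A minimal sketch computing both scores on the same linnerud split used above (exact values depend on the split and model; this reuses the deprecated sklearn.cross_validation module from these notes):
from sklearn.datasets import load_linnerud
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, explained_variance_score
from sklearn import cross_validation
linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=0)
reg = LinearRegression().fit(X_train, y_train)
# both scores equal 1.0 for a perfect fit; lower values mean a worse fit
print "R2 score: {:.2f}".format(r2_score(y_test, reg.predict(X_test)))
print "Explained variance score: {:.2f}".format(explained_variance_score(y_test, reg.predict(X_test)))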