Use sklearn's cross_validation module to split the data into training and testing sets.

See the following link: http://scikit-learn.org/stable/modules/cross_validation.html

Evaluation Metrics


Classification Metrics

accuracy = number of correctly identified instances / all instances

In sklearn, a classifier's .score() method returns this accuracy.

# In this and the following exercises, you'll be adding train test splits to the data
# to see how it changes the performance of each classifier
# The code provided will load the Titanic dataset like you did in project 0, then train
# a decision tree (the method you used in your project) and a Bayesian classifier (as
# discussed in the introduction videos). You don't need to worry about how these work for
# now.
# What you do need to do is import a train/test split, train the classifiers on the
# training data, and store the resulting accuracy scores in the dictionary provided.

import numpy as np
import pandas as pd

from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
X = pd.read_csv('titanic_data.csv')
# Limit to numeric data
X = X._get_numeric_data()
# Separate the labels
y = X['Survived']
# Remove labels from the inputs, and age due to missing data
del X['Age'], X['Survived']

# Split the data into training and testing sets, then train and test the
# classifiers with the split data instead of the full X and y.
# This exercise holds out 40% of the rows; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=0)

# The decision tree classifier: fit on the training rows only, score on the
# held-out rows. (Single-argument print() runs under both Python 2 and 3.)
clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
print("Decision Tree has accuracy:  {}".format(
    accuracy_score(y_test, clf1.predict(X_test))))

# The naive Bayes classifier, trained and scored the same way.
clf2 = GaussianNB()
clf2.fit(X_train, y_train)
print("GaussianNB has accuracy:  {}".format(
    accuracy_score(y_test, clf2.predict(X_test))))

# Accuracy scores on the test set, as recorded in the notes.
answer = {
    "Naive Bayes Score": 0.638655462185,
    "Decision Tree Score": 0.672268907563,
}

Shortcomings of accuracy:

  1. not ideal for skewed classes
  2. may want to err on the side of guessing innocent
  3. may want to err on the side of guessing guilty

Confusion Matrix


predicted class \ actual class | positive        | negative
positive                       | true positives  | false positives
negative                       | false negatives | true negatives

# In this exercise, we'll use the Titanic dataset as before, train two classifiers and
# look at their confusion matrices. Your job is to create a train/test split in the data
# and report the results in the dictionary at the bottom.

import numpy as np
import pandas as pd

from sklearn import cross_validation
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Load the dataset, keep only the numeric columns, and separate the labels.
X = pd.read_csv('titanic_data.csv')

X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']

# Split the data into training and testing sets (test_size = 0.25), then train
# and test the classifiers with the split data instead of the full X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.25, random_state=0)

# Both classifiers are fit on the training rows only, so the confusion
# matrices below are computed on rows the models have not seen.
clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
print("Confusion matrix for this Decision Tree:\n{}".format(
    confusion_matrix(y_test, clf1.predict(X_test))))

clf2 = GaussianNB()
clf2.fit(X_train, y_train)
print("GaussianNB confusion matrix:\n{}".format(
    confusion_matrix(y_test, clf2.predict(X_test))))

# Confusion matrices on the test set, as recorded in the notes
# (written as nested lists so the dictionary is valid Python).
confusions = {
    "Naive Bayes": [[122, 17],
                    [48, 36]],
    "Decision Tree": [[100, 39],
                      [39, 45]],
}
Precision and Recall

recall 查全率 —— True Positive / (True Positive + False Negative)

precision 查准率 —— True Positive / (True Positive + False Positive)

For "Gerhard Schroeder":

True Positives = 14

False Positives = 1

False Negatives = 12

# As with the previous exercises, let's look at the performance of a couple of classifiers
# on the familiar Titanic dataset. Add a train/test split, then store the results in the
# dictionary provided.

import numpy as np
import pandas as pd

from sklearn import cross_validation
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Load the dataset, keep only the numeric columns, and separate the labels.
X = pd.read_csv('titanic_data.csv')

X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']

# Split the data into training and testing sets, then train and test the
# classifiers with the split data instead of the full X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.25, random_state=0)

# Predict once per model and reuse the predictions for both metrics.
clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
tree_pred = clf1.predict(X_test)
print("Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    recall(y_test, tree_pred), precision(y_test, tree_pred)))

clf2 = GaussianNB()
clf2.fit(X_train, y_train)
nb_pred = clf2.predict(X_test)
print("GaussianNB recall: {:.2f} and precision: {:.2f}".format(
    recall(y_test, nb_pred), precision(y_test, nb_pred)))

# Recall and precision on the test set, as recorded in the notes.
results = {
    "Naive Bayes Recall": 0.43,
    "Naive Bayes Precision": 0.68,
    "Decision Tree Recall": 0.56,
    "Decision Tree Precision": 0.57,
}
F1 Score

$$F_1 = \frac{2 \cdot \text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$

# As usual, use a train/test split to get a reliable F1 score from two classifiers, and
# save the scores in the provided dictionaries.

import numpy as np
import pandas as pd

from sklearn import cross_validation
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Load the dataset, keep only the numeric columns, and separate the labels.
X = pd.read_csv('titanic_data.csv')

X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']

# Split the data into training and testing sets, then train and test the
# classifiers with the split data instead of the full X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.25, random_state=0)

# Fit on the training rows only; the F1 score is computed on held-out rows.
clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
print("Decision Tree F1 score: {:.2f}".format(
    f1_score(y_test, clf1.predict(X_test))))

clf2 = GaussianNB()
clf2.fit(X_train, y_train)
print("GaussianNB F1 score: {:.2f}".format(
    f1_score(y_test, clf2.predict(X_test))))

# F1 scores on the test set, as recorded in the notes.
F1_scores = {
    "Naive Bayes": 0.54,
    "Decision Tree": 0.53,
}
Regression metrics

Mean Absolute Error

import numpy as np
import pandas as pd

from sklearn import cross_validation
from sklearn.datasets import load_linnerud
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.tree import DecisionTreeRegressor

# Load the dataset: the loader returns a bunch whose `data` attribute holds
# the input features and whose `target` attribute holds the regression targets.
linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

# Split the data into training and testing sets, then train and test the
# regressors with the split data instead of the full X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.25, random_state=0)

# Fit on the training rows only; the error is measured on held-out rows.
reg1 = DecisionTreeRegressor()
reg1.fit(X_train, y_train)
print("Decision Tree mean absolute error: {:.2f}".format(
    mae(y_test, reg1.predict(X_test))))

reg2 = LinearRegression()
reg2.fit(X_train, y_train)
print("Linear regression mean absolute error: {:.2f}".format(
    mae(y_test, reg2.predict(X_test))))

# Mean absolute errors on the test set, as recorded in the notes.
results = {
    "Linear Regression": 11.45,
    "Decision Tree": 17.27,
}
Mean Squared Error

import numpy as np
import pandas as pd

from sklearn import cross_validation
from sklearn.datasets import load_linnerud
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor

# Load the dataset: the loader returns a bunch whose `data` attribute holds
# the input features and whose `target` attribute holds the regression targets.
linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

# Split the data into training and testing sets, then train and test the
# regressors with the split data instead of the full X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.25, random_state=0)

# Fit on the training rows only; the error is measured on held-out rows.
# The labels say "squared" because this exercise reports mse, not mae.
reg1 = DecisionTreeRegressor()
reg1.fit(X_train, y_train)
print("Decision Tree mean squared error: {:.2f}".format(
    mse(y_test, reg1.predict(X_test))))

reg2 = LinearRegression()
reg2.fit(X_train, y_train)
print("Linear regression mean squared error: {:.2f}".format(
    mse(y_test, reg2.predict(X_test))))

# Mean squared errors on the test set, as recorded in the notes.
results = {
    "Linear Regression": 380.46,
    "Decision Tree": 1087.27,
}
Regression Scoring Functions

0 is bad, 1 is perfect.

One of these functions is the R2 score.

The other is the explained variance score.

results matching ""

    No results matching ""