Kaggle Dataset

Each image is 28 pixels in height and 28 pixels in width, for a total of 784 pixels in total. Each pixel has a single pixel-value associated with it, indicating the lightness or darkness of that pixel, with higher numbers meaning darker. This pixel-value is an integer between 0 and 255, inclusive.

The training data set, (train.csv), has 785 columns. The first column, called "label", is the digit that was drawn by the user. The rest of the columns contain the pixel-values of the associated image.

Each pixel column in the training set has a name like pixelx, where x is an integer between 0 and 783, inclusive. To locate this pixel on the image, suppose that we have decomposed x as x = i * 28 + j, where i and j are integers between 0 and 27, inclusive. Then pixelx is located on row i and column j of a 28 x 28 matrix, (indexing by zero).

In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import timeit
/home/onepanel/.conda/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [14]:
train = pd.read_csv("train.csv")
features = train.columns[1:]
X = train[features]
y = train['label']
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X/255.,y,test_size=0.1,random_state=0)
In [15]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)
(42000, 784)
(37800, 784)
(4200, 784)

Random Forest

In [17]:
import time
start=time.time()
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("random forest accuracy: ",acc_rf)
end=time.time()
print(end-start)
random forest accuracy:  0.9366666666666666
2.6753814220428467
In [22]:
estimator=clf_rf.estimators_[1]
feature_names=list(range(784))
class_names=list(range(10))

f_names=[]
for i in feature_names:
    f_names.append(str(i))
#class_names=str(class_names)

c_names=[]
for i in class_names:
    c_names.append(str(i))



print(len(f_names))
print(len(c_names))
784
10

SGD Classifier

In [26]:
import time
start=time.time()
clf_sgd = SGDClassifier()
clf_sgd.fit(X_train, y_train)
y_pred_sgd = clf_sgd.predict(X_test)
acc_sgd = accuracy_score(y_test, y_pred_sgd)
print("stochastic gradient descent accuracy: ",acc_sgd)
end=time.time()
print(end-start)
/home/onepanel/.conda/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)
stochastic gradient descent accuracy:  0.8876190476190476
2.553283452987671

SVM

In [27]:
import time
start=time.time()
clf_svm = LinearSVC()
clf_svm.fit(X_train, y_train)
y_pred_svm = clf_svm.predict(X_test)
acc_svm = accuracy_score(y_test, y_pred_svm)
print("Linear SVM accuracy: ",acc_svm)
end=time.time()
print(end-start)
Linear SVM accuracy:  0.91
48.14553713798523

KNN

In [28]:
import time
start=time.time()
clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)
y_pred_knn = clf_knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
print("nearest neighbors accuracy: ",acc_knn)
end=time.time()
print(end-start)
nearest neighbors accuracy:  0.9666666666666667
246.23137283325195
In [30]:
import numpy as np
import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
In [31]:
# build a classifier
clf = RandomForestClassifier(n_estimators=20)
In [45]:
param_grid = {"min_samples_split": [2, 10],
              "criterion": ["gini", "entropy"]}
In [47]:
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
start = time.time()
grid_search.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % (time.time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)
GridSearchCV took 134.05 seconds for 4 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.954 (std: 0.001)
Parameters: {'criterion': 'entropy', 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.953 (std: 0.002)
Parameters: {'criterion': 'gini', 'min_samples_split': 2}

Model with rank: 3
Mean validation score: 0.953 (std: 0.001)
Parameters: {'criterion': 'entropy', 'min_samples_split': 10}

In [38]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
In [48]:
# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 4
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5)

start = time.time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.cv_results_)
RandomizedSearchCV took 42.43 seconds for 4 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.944 (std: 0.001)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 6, 'min_samples_split': 4}

Model with rank: 2
Mean validation score: 0.939 (std: 0.002)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_split': 2}

Model with rank: 3
Mean validation score: 0.938 (std: 0.002)
Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 4, 'min_samples_split': 3}

Bayesian Optimization

In [50]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
In [52]:
from sklearn.model_selection import cross_val_score
def acc_model(params):
    clf = RandomForestClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()

param_space = {
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', range(1,20)),
    'criterion': hp.choice('criterion', ["gini", "entropy"])}

best = 0
def f(params):
    global best
    acc = acc_model(params)
    if acc > best:
        best = acc
    print ('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, param_space, algo=tpe.suggest, max_evals=100, trials=trials)
print ('best:')
print (best)
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 13, 'max_features': 14}
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 16}
new best: 0.9282014151604457 {'criterion': 'gini', 'max_depth': 2, 'max_features': 15}
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 9, 'max_features': 6}
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 9, 'max_features': 4}
new best: 0.9282014151604457 {'criterion': 'gini', 'max_depth': 15, 'max_features': 2}
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 17, 'max_features': 12}
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 4, 'max_features': 6}
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 11, 'max_features': 18}
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 3, 'max_features': 18}
new best: 0.9282014151604457 {'criterion': 'gini', 'max_depth': 9, 'max_features': 13}
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 13, 'max_features': 5}
new best: 0.9282014151604457 {'criterion': 'gini', 'max_depth': 18, 'max_features': 4}
new best: 0.9282014151604457 {'criterion': 'entropy', 'max_depth': 12, 'max_features': 15}
new best: 0.9294445893695121 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 19}
new best: 0.9294445893695121 {'criterion': 'entropy', 'max_depth': 4, 'max_features': 16}
new best: 0.9294445893695121 {'criterion': 'gini', 'max_depth': 10, 'max_features': 9}
new best: 0.9294445893695121 {'criterion': 'gini', 'max_depth': 2, 'max_features': 16}
new best: 0.9294445893695121 {'criterion': 'entropy', 'max_depth': 11, 'max_features': 12}
new best: 0.9294445893695121 {'criterion': 'gini', 'max_depth': 19, 'max_features': 13}
new best: 0.9305026876332022 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 14, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 14, 'max_features': 7}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 1, 'max_features': 3}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 6, 'max_features': 11}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 8, 'max_features': 10}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 14, 'max_features': 1}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 7, 'max_features': 17}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 14, 'max_features': 8}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 14, 'max_features': 14}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 5, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'gini', 'max_depth': 5, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 15, 'max_features': 2}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 17, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'gini', 'max_depth': 18, 'max_features': 10}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 1, 'max_features': 5}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 8, 'max_features': 17}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 3, 'max_features': 6}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 13, 'max_features': 3}
new best: 0.9318250442274568 {'criterion': 'gini', 'max_depth': 19, 'max_features': 1}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 6, 'max_features': 15}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 12, 'max_features': 8}
new best: 0.9318250442274568 {'criterion': 'gini', 'max_depth': 16, 'max_features': 7}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 7, 'max_features': 4}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 9, 'max_features': 9}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 4, 'max_features': 18}
new best: 0.9318250442274568 {'criterion': 'gini', 'max_depth': 10, 'max_features': 11}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 2, 'max_features': 14}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 11, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'gini', 'max_depth': 16, 'max_features': 12}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 17, 'max_features': 16}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 15, 'max_features': 13}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 3, 'max_features': 2}
new best: 0.9318250442274568 {'criterion': 'gini', 'max_depth': 14, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 13, 'max_features': 6}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 18, 'max_features': 15}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 9}
new best: 0.9318250442274568 {'criterion': 'gini', 'max_depth': 14, 'max_features': 18}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 12, 'max_features': 5}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 10, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 9, 'max_features': 4}
new best: 0.9318250442274568 {'criterion': 'gini', 'max_depth': 19, 'max_features': 7}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 2, 'max_features': 3}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 1, 'max_features': 10}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 4, 'max_features': 17}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 5, 'max_features': 19}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 8, 'max_features': 1}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 7, 'max_features': 8}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 6, 'max_features': 12}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 16, 'max_features': 11}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 11, 'max_features': 14}
new best: 0.9318250442274568 {'criterion': 'entropy', 'max_depth': 14, 'max_features': 13}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 17, 'max_features': 19}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 17, 'max_features': 16}
new best: 0.9323279207713688 {'criterion': 'gini', 'max_depth': 17, 'max_features': 19}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 15, 'max_features': 5}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 17, 'max_features': 2}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 3, 'max_features': 6}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 17, 'max_features': 19}
new best: 0.9323279207713688 {'criterion': 'gini', 'max_depth': 13, 'max_features': 15}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 14, 'max_features': 10}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 19, 'max_features': 9}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 18, 'max_features': 4}
new best: 0.9323279207713688 {'criterion': 'gini', 'max_depth': 12, 'max_features': 3}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 8, 'max_features': 19}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 1, 'max_features': 18}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 9, 'max_features': 7}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 10, 'max_features': 17}
new best: 0.9323279207713688 {'criterion': 'gini', 'max_depth': 5, 'max_features': 1}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 2, 'max_features': 8}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 4, 'max_features': 11}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 17, 'max_features': 12}
new best: 0.9323279207713688 {'criterion': 'gini', 'max_depth': 6, 'max_features': 19}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 14, 'max_features': 14}
new best: 0.9323279207713688 {'criterion': 'entropy', 'max_depth': 11, 'max_features': 13}
best:
{'criterion': 1, 'max_depth': 16, 'max_features': 18}

AutoKeras

In [57]:
from keras.datasets import mnist
from autokeras import ImageClassifier

# loadning mnist from keras
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# each image has to 3D: 2 coordinates, 1 value (gray scale)
X_train = X_train.reshape(X_train.shape + (1,))
X_test = X_test.reshape(X_test.shape + (1,))
Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
11493376/11490434 [==============================] - 1s 0us/step
In [58]:
# initialize the classifier
# data augmentation set to True
clf = ImageClassifier(verbose=True, augment=True)
In [ ]:
# fit the classifier to the dataset
# nb!: default time limit is 24 h!
# for this example 1 h should be enough
clf.fit(X_train, y_train, time_limit=(60*10))
In [ ]: