Each image is 28 pixels in height and 28 pixels in width, for a total of 784 pixels. Each pixel has a single pixel-value associated with it, indicating the lightness or darkness of that pixel, with higher numbers meaning darker. This pixel-value is an integer between 0 and 255, inclusive.
The training data set (train.csv) has 785 columns. The first column, called "label", is the digit that was drawn by the user. The rest of the columns contain the pixel-values of the associated image.
Each pixel column in the training set has a name like pixelx, where x is an integer between 0 and 783, inclusive. To locate this pixel in the image, decompose x as x = i * 28 + j, where i and j are integers between 0 and 27, inclusive. Then pixelx is located at row i and column j of a 28 × 28 matrix (indexing from zero).
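To make the mapping concrete, a two-line sketch (pixel42 is just an arbitrary example column):
x = 42                 # the column "pixel42"
i, j = divmod(x, 28)   # i = x // 28 (row), j = x % 28 (column)
print(i, j)            # -> 1 14: pixel42 sits at row 1, column 14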
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import time
train = pd.read_csv("train.csv")
features = train.columns[1:]
X = train[features]
y = train['label']
# scale pixel values to [0, 1] and hold out 10% of the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(X / 255., y, test_size=0.1, random_state=0)
print(X.shape)
print(X_train.shape)
print(X_test.shape)
start = time.time()
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("random forest accuracy: ",acc_rf)
end=time.time()
print(end-start)
# pick one tree out of the fitted forest and prepare string names for visualization
estimator = clf_rf.estimators_[1]
f_names = [str(i) for i in range(784)]   # one name per pixel feature
c_names = [str(i) for i in range(10)]    # one name per digit class
print(len(f_names))
print(len(c_names))
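The name lists are only useful once the selected tree is actually rendered; a minimal sketch with sklearn.tree.export_graphviz (the output file name tree.dot is an arbitrary choice, and rendering the .dot file requires Graphviz):
from sklearn.tree import export_graphviz
# write the chosen tree to a .dot file; render it with: dot -Tpng tree.dot -o tree.png
export_graphviz(estimator, out_file='tree.dot',
                feature_names=f_names, class_names=c_names,
                filled=True, rounded=True)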
start = time.time()
clf_sgd = SGDClassifier()
clf_sgd.fit(X_train, y_train)
y_pred_sgd = clf_sgd.predict(X_test)
acc_sgd = accuracy_score(y_test, y_pred_sgd)
print("stochastic gradient descent accuracy: ",acc_sgd)
end=time.time()
print(end-start)
start = time.time()
clf_svm = LinearSVC()
clf_svm.fit(X_train, y_train)
y_pred_svm = clf_svm.predict(X_test)
acc_svm = accuracy_score(y_test, y_pred_svm)
print("Linear SVM accuracy: ",acc_svm)
end=time.time()
print(end-start)
start = time.time()
clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)
y_pred_knn = clf_knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
print("nearest neighbors accuracy: ",acc_knn)
end=time.time()
print(end-start)
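With all four baselines trained, their held-out accuracies can be printed side by side for a quick comparison (this just reuses the scores computed above):
# quick side-by-side summary of the four baselines
for name, acc in [("random forest", acc_rf),
                  ("SGD", acc_sgd),
                  ("linear SVM", acc_svm),
                  ("k-nearest neighbors", acc_knn)]:
    print("%-20s %.4f" % (name, acc))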
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# build a classifier
clf = RandomForestClassifier(n_estimators=20)
param_grid = {"min_samples_split": [2, 10],
              "criterion": ["gini", "entropy"]}
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
start = time.time()
grid_search.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % (time.time() - start, len(grid_search.cv_results_['params'])))
def report(results, n_top=3):
    """Print the top n parameter settings found by a search."""
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
report(grid_search.cv_results_)
# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "criterion": ["gini", "entropy"]}
# run randomized search
n_iter_search = 4
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5)
start = time.time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
" parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.cv_results_)
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
def acc_model(params):
    clf = RandomForestClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()
param_space = {
    'max_depth': hp.choice('max_depth', range(1, 20)),
    'max_features': hp.choice('max_features', range(1, 20)),
    'criterion': hp.choice('criterion', ["gini", "entropy"])}
best = 0
def f(params):
    global best
    acc = acc_model(params)
    if acc > best:
        best = acc
        print('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}
trials = Trials()
best_params = fmin(f, param_space, algo=tpe.suggest, max_evals=100, trials=trials)
print('best:')
print(best_params)
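Note that fmin reports hp.choice parameters as index positions, not the values themselves; hyperopt's space_eval maps them back:
from hyperopt import space_eval
# translate hp.choice indices back into the actual parameter values
print(space_eval(param_space, best_params))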
from keras.datasets import mnist
from autokeras import ImageClassifier
# load MNIST via keras
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# each image has to be 3D: 2 spatial dimensions plus 1 channel (grayscale)
X_train = X_train.reshape(X_train.shape + (1,))
X_test = X_test.reshape(X_test.shape + (1,))
# initialize the classifier
# data augmentation set to True
clf = ImageClassifier(verbose=True, augment=True)
# fit the classifier to the dataset
# nb: the default time limit is 24 h!
# here the search is capped at 10 minutes
clf.fit(X_train, y_train, time_limit=60 * 10)
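After the search, the best architecture still needs a final training pass before scoring; a minimal sketch assuming the autokeras 0.x API used above:
# retrain the best model found during the search, then score it on the test set
clf.final_fit(X_train, y_train, X_test, y_test, retrain=True)
print(clf.evaluate(X_test, y_test))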