Contents
- Introduction
- Quick models using fastai on EMNIST
- Auto-Keras on EMNIST
- TPOT on EMNIST
- Comparison of Results
Introduction
EMNIST [1] is another MNIST-like dataset in the tradition of the original MNIST [2], Fashion-MNIST [3] and Kuzushiji-MNIST [4]. Like Kuzushiji-MNIST, EMNIST is not a single dataset but a collection of six subsets covering various classes of handwritten letters and digits. Some of the class distributions are balanced, others are not.
If you like MNIST-like datasets, then have a look at CMNIST as well.
EMNIST-ByClass
EMNIST-ByClass consists of 62 classes containing 814255 samples.

Let’s have a look at some examples:

EMNIST-ByMerge
EMNIST-ByMerge consists of 47 classes containing 814255 samples.

Let’s have a look at some examples here:

EMNIST-Balanced
EMNIST-Balanced consists of 47 classes containing 131600 samples.

Let’s have a look at some examples:

EMNIST-Letters
EMNIST-Letters consists of 26 classes containing 145600 samples.

Let’s have a look at some examples of this dataset:

EMNIST-Digits
EMNIST-Digits consists of 10 classes containing 280000 samples.

Let’s have a look at some examples of this dataset:

EMNIST-MNIST
EMNIST-MNIST consists of 10 classes containing 70000 samples.

Let’s have a look at some examples of this dataset:

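All six subsets can be loaded with the python-mnist package, which is also used in the scripts below. Here is a minimal sketch that prints the size and number of classes of each subset; it assumes the extracted EMNIST binaries live under ./data/orig, as in the dump script further down:
# load_emnist_subsets.py #
import numpy as np
from mnist import MNIST

for subset in ["byclass", "bymerge", "balanced", "letters", "digits", "mnist"]:
    mndata = MNIST("./data/orig")
    mndata.select_emnist(subset)
    X_train, y_train = mndata.load_training()
    X_test, y_test = mndata.load_testing()
    # combine train and test labels to count samples and classes per subset
    y = np.concatenate([np.asarray(y_train), np.asarray(y_test)])
    print(subset, "samples:", len(y), "classes:", len(np.unique(y)))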
Quick models using fastai on EMNIST
Loading custom datasets into fastai is a real pain. The standard PyTorch data generators hardly work, and the built-in solutions are messy and end in poorly documented code. Therefore, we write a small function that dumps the EMNIST (sub)datasets into an ImageNet-style folder structure. There is probably a workaround, but this was the quickest solution that came to my mind.
# dumpImages.py #
import numpy as np
import pandas as pd
from mnist import MNIST
import gc
import os
from PIL import Image
from sklearn.model_selection import train_test_split
def dumpToImages(X, y, SetType, DatasetName, Path):
    def saveImg(X, y, fullPath):
        # write every sample as a 28x28 RGB png into its label folder
        imgDict = {}
        for i in range(len(X)):
            imgTmp = np.asarray(X[i]).reshape(28, 28)
            imgTmp = np.stack((imgTmp,) * 3, axis=-1)
            imgTmp = imgTmp.astype(np.uint8)
            img = Image.fromarray(imgTmp)
            filePathAndName = fullPath + str(y[i]) + "/" + str(i) + ".png"
            img.save(filePathAndName)
            imgDict[i] = {"file": filePathAndName,
                          "label": y[i]}
        print("Images dumped")
        return imgDict

    if not os.path.exists(Path + DatasetName):
        os.makedirs(Path + "/" + DatasetName + "/")
    fullPath = Path + DatasetName + "/" + SetType + "/"
    y_unique = np.unique(y)
    # create one sub-folder per label for the requested split and dump the images
    if SetType in ("train", "valid", "test"):
        if not os.path.exists(fullPath):
            os.makedirs(fullPath)
        for i in y_unique:
            if not os.path.exists(fullPath + "/" + str(i) + "/"):
                os.makedirs(fullPath + "/" + str(i) + "/")
        result = saveImg(X, y, fullPath)
    return pd.DataFrame.from_dict(result, orient="index")


def dump_subset(subset, subsetName):
    print("Dumping", subsetName)
    mndata = MNIST("./data/orig")
    mndata.select_emnist(subset)
    X_train, y_train = mndata.load_training()
    y_train = np.asarray(y_train)
    # hold out 20 % of the training data as a validation set
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=42)
    X_test, y_test = mndata.load_testing()
    y_test = np.asarray(y_test)
    DF_train = dumpToImages(X_train, y_train, "train", subsetName, "./data/")
    DF_train.to_csv("./data/" + subsetName + "_train.csv")
    del X_train
    del y_train
    gc.collect()
    DF_valid = dumpToImages(X_valid, y_valid, "valid", subsetName, "./data/")
    DF_valid.to_csv("./data/" + subsetName + "_valid.csv")
    del X_valid
    del y_valid
    gc.collect()
    DF_test = dumpToImages(X_test, y_test, "test", subsetName, "./data/")
    DF_test.to_csv("./data/" + subsetName + "_test.csv")
    del X_test
    del y_test
    gc.collect()
    print(subsetName, "dumped")


def main():
    subsets = ["mnist", "digits", "letters", "balanced", "bymerge", "byclass"]
    subsetNames = ["EMNIST-MNIST", "EMNIST-Digits", "EMNIST-Letters", "EMNIST-Balanced", "EMNIST-Bymerge", "EMNIST-Byclass"]
    for i in range(len(subsets)):
        dump_subset(subsets[i], subsetNames[i])


if __name__ == "__main__":
    main()
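Since the dump takes a while, a quick sanity check of the result does not hurt. The following is only a minimal sketch; it assumes the CSV files written by dumpImages.py above (e.g. ./data/EMNIST-MNIST_train.csv):
# check_dump.py #
import pandas as pd

for subset in ["EMNIST-MNIST", "EMNIST-Digits", "EMNIST-Letters",
               "EMNIST-Balanced", "EMNIST-Bymerge", "EMNIST-Byclass"]:
    for split in ["train", "valid", "test"]:
        df = pd.read_csv("./data/" + subset + "_" + split + ".csv", usecols=["file", "label"])
        # number of dumped images and the per-class distribution of this split
        print(subset, split, len(df))
        print(df["label"].value_counts().sort_index())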
Since I think manual, repetitive work is stupid, I wrote a little script to explore how some very basic models perform on EMNIST:
# train_test_fastai_models.py #
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import os
import sys
import time
import gc  # imported explicitly for the gc.collect() calls in main()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import pickle
import random
from sklearn.model_selection import train_test_split
from PIL import Image
import PIL
import copy
import itertools
import sklearn.metrics as skmetrics
import torch
import torch.nn as nn
import fastai
from fastai import *
from fastai.vision import *
from fastai.callbacks import *
from fastai.basic_train import *
from fastai.vision.learner import *
def training(subset):
    train_df = pd.read_csv("./data/" + subset + "_train.csv", usecols=["file", "label"])
    valid_df = pd.read_csv("./data/" + subset + "_valid.csv", usecols=["file", "label"])
    train_valid_df = train_df.append(valid_df, ignore_index=True)
    ds_tfms = get_transforms(do_flip=False, flip_vert=False, max_rotate=30, max_zoom=1.1, max_warp=0.3)
    # ImageDataBunch.from_folder seems to be dysfunctional
    data = ImageDataBunch.from_df("", train_valid_df, ds_tfms=ds_tfms, size=28, bs=1024)
    print(data)
    learn = cnn_learner(data, models.resnet18, metrics=accuracy)
    # run the learning rate finder and save the suggestion plot
    learn.lr_find(start_lr=1e-6, end_lr=1e1, stop_div=True, num_it=200)
    plt.figure()
    learn.recorder.plot(suggestion=True)
    plt.title("Optimal Learning Rate - " + subset)
    plt.savefig("./graphics/" + subset + "_lr_selection.png")
    plt.savefig("./graphics/" + subset + "_lr_selection.pdf")
    plt.close()
    lr = learn.recorder.min_grad_lr
    print(lr)
    # train for 10 epochs with the one-cycle policy
    learn.fit_one_cycle(10, lr)
    plt.figure()
    learn.recorder.plot_metrics()
    plt.title("Training accuracies - " + subset)
    plt.savefig("./graphics/" + subset + "_train_accs.png")
    plt.savefig("./graphics/" + subset + "_train_accs.pdf")
    plt.close()
    plt.figure()
    learn.recorder.plot_lr()
    plt.title("Training Learning Rates - " + subset)
    plt.savefig("./graphics/" + subset + "_train_lr.png")
    plt.savefig("./graphics/" + subset + "_train_lr.pdf")
    plt.close()
    plt.figure()
    learn.recorder.plot_losses()
    plt.title("Training Losses - " + subset)
    plt.savefig("./graphics/" + subset + "_train_losses.png")
    plt.savefig("./graphics/" + subset + "_train_losses.pdf")
    plt.close()
    learn.export(subset + ".pkl")


def testing(subset):
    def plot_confusion_matrix(cm, classes,
                              normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        Original source: scikit-learn documentation
        """
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            title = title + " (normalized) \n"
        else:
            title = title + "\n"
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes)
        plt.yticks(tick_marks, classes)
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()

    test = pd.read_csv("./data/" + subset + "_test.csv", usecols=["file", "label"])
    y_true = test["label"].values
    # load the exported learner and predict the held-out test set
    learn = load_learner("./", subset + ".pkl", test=ImageList.from_df(test, path=""))
    y_pred, _ = learn.get_preds(ds_type=DatasetType.Test)
    y_pred = np.argmax(y_pred.numpy(), axis=1)
    acc = skmetrics.accuracy_score(y_true, y_pred)
    title = subset + " Acc:" + str(np.round(acc, 4)) + " confusion matrix"
    classes = sorted(np.unique(y_true))
    plt.figure(figsize=(18, 8))
    plt.subplot(1, 2, 1)
    plot_confusion_matrix(skmetrics.confusion_matrix(y_true=y_true, y_pred=y_pred), classes, title=title)
    plt.subplot(1, 2, 2)
    plot_confusion_matrix(skmetrics.confusion_matrix(y_true=y_true, y_pred=y_pred), classes, title=title, normalize=True)
    plt.savefig("./graphics/" + subset + "_confusion_matrix.png")
    plt.savefig("./graphics/" + subset + "_confusion_matrix.pdf")
    plt.close()


def main():
    tick = time.time()
    print("fastai version:", fastai.__version__)
    print("PyTorch version:", torch.__version__)
    subsets = ["EMNIST-MNIST", "EMNIST-Digits", "EMNIST-Letters", "EMNIST-Balanced", "EMNIST-Bymerge", "EMNIST-Byclass"]
    for subset in subsets:
        print("Running fastai on", subset)
        training(subset)
        gc.collect()
        testing(subset)
        # note: this prints the time elapsed since the start of the script, i.e. cumulative over subsets
        print("Subset finished after:", time.time() - tick, "seconds")
        gc.collect()
    print("Finished after:", (time.time() - tick) / 3600, "hours")


if __name__ == "__main__":
    main()
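As a small usage example, the exported learners can also classify single images. The following is just a sketch; it assumes fastai v1, the EMNIST-MNIST.pkl exported by training() and one of the pngs written by dumpImages.py (the file path is only an example):
# predict_single_image.py #
from fastai.basic_train import load_learner
from fastai.vision import open_image

learn = load_learner("./", "EMNIST-MNIST.pkl")
# any of the dumped test images works here
img = open_image("./data/EMNIST-MNIST/test/0/0.png")
pred_class, pred_idx, probs = learn.predict(img)
print(pred_class, probs[pred_idx])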
In total, this took about 6.27 hours. Here are the results:
EMNIST-MNIST
epoch train_loss valid_loss accuracy time
0 1.758948 0.900828 0.705417 00:16
1 0.859533 0.160136 0.950250 00:16
2 0.444318 0.088437 0.973583 00:16
3 0.259608 0.061491 0.982250 00:15
4 0.173354 0.056510 0.982917 00:15
5 0.129357 0.042662 0.987667 00:16
6 0.107227 0.041561 0.987917 00:16
7 0.090129 0.042532 0.987750 00:16
8 0.081324 0.037910 0.989583 00:16
9 0.073856 0.037616 0.989333 00:16
Subset finished after: 229.53496646881104 seconds






EMNIST-Digits
epoch train_loss valid_loss accuracy time
0 0.621408 0.179451 0.944687 01:00
1 0.160846 0.063142 0.980604 01:00
2 0.089626 0.038934 0.988312 00:59
3 0.071559 0.044202 0.986687 01:00
4 0.060354 0.031984 0.990562 01:00
5 0.052091 0.027022 0.992083 01:01
6 0.045011 0.022862 0.992854 01:01
7 0.040925 0.020127 0.994083 01:00
8 0.038547 0.018354 0.994708 01:00
9 0.036072 0.018189 0.994729 01:00
Subset finished after: 906.8308148384094 seconds






EMNIST-Letters
epoch train_loss valid_loss accuracy time
0 2.176001 0.966784 0.703926 03:01
1 1.005924 0.447574 0.851603 00:33
2 0.629116 0.352484 0.882332 00:32
3 0.491678 0.316396 0.892748 00:33
4 0.426249 0.285679 0.902724 00:33
5 0.390442 0.245990 0.913301 00:32
6 0.357532 0.247692 0.912059 00:32
7 0.333830 0.215326 0.925280 00:31
8 0.312843 0.207082 0.927724 00:32
9 0.308315 0.203592 0.929647 00:32
Subset finished after: 2042.9792199134827 seconds






EMNIST-Balanced
epoch train_loss valid_loss accuracy time
0 2.880712 1.399821 0.583112 02:47
1 1.456914 0.660040 0.775399 00:29
2 0.950243 0.540631 0.811392 00:29
3 0.754311 0.483476 0.831560 00:29
4 0.672728 0.489963 0.827128 00:29
5 0.620223 0.424026 0.841489 00:29
6 0.578693 0.380079 0.864849 00:29
7 0.545485 0.371544 0.864982 00:29
8 0.517935 0.359465 0.868573 00:29
9 0.503609 0.353420 0.870567 00:29
Subset finished after: 3082.569526910782 seconds






EMNIST-ByMerge
epoch train_loss valid_loss accuracy time
0 0.757444 0.469373 0.838078 1:16:12
1 0.539941 0.429258 0.851332 02:52
2 0.475716 0.379418 0.862264 02:52
3 0.452739 0.364511 0.868862 02:52
4 0.425480 0.343968 0.872774 02:52
5 0.412752 0.356715 0.869063 02:54
6 0.392824 0.307990 0.887267 02:52
7 0.382148 0.293193 0.890584 02:52
8 0.371294 0.289000 0.891056 02:52
9 0.371566 0.288419 0.891193 02:52
Subset finished after: 11022.80905342102 seconds






EMNIST-ByClass
epoch train_loss valid_loss accuracy time
0 0.899894 0.603188 0.791892 1:43:18
1 0.649401 0.499177 0.824868 34:16
2 0.595045 0.473051 0.834281 02:52
3 0.564651 0.455126 0.839411 02:53
4 0.537018 0.459642 0.838042 02:52
5 0.515960 0.408394 0.851253 02:53
6 0.500930 0.400614 0.852320 02:52
7 0.482904 0.386061 0.857292 02:53
8 0.472474 0.384079 0.857149 02:52
9 0.465676 0.383195 0.857557 02:53
Subset finished after: 22570.254970788956 seconds






With increasing class imbalance, more classes and larger sample sizes, the results get worse. However, we have to remind ourselves that this is a transfer-learning approach trained for only 10 epochs as a quick-and-dirty evaluation, not a serious attempt to break any benchmarks ;). I would have used capsule networks for that anyhow ;).
Auto-Keras on EMNIST
Let’s do the obligatory Auto-Keras test as well.
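A run on one subset looks roughly like the following sketch. It assumes the old Auto-Keras 0.4 ImageClassifier API, the python-mnist loader used above and an arbitrary two-hour search budget, so treat it as an illustration rather than the exact script:
# autokeras_emnist.py #
import numpy as np
from mnist import MNIST
from autokeras.image.image_supervised import ImageClassifier

mndata = MNIST("./data/orig")
mndata.select_emnist("balanced")  # any of the six subsets works here
X_train, y_train = mndata.load_training()
X_test, y_test = mndata.load_testing()
# Auto-Keras expects image tensors, so reshape the flat vectors to 28x28x1
X_train = np.asarray(X_train, dtype=np.uint8).reshape(-1, 28, 28, 1)
X_test = np.asarray(X_test, dtype=np.uint8).reshape(-1, 28, 28, 1)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

clf = ImageClassifier(verbose=True)
clf.fit(X_train, y_train, time_limit=2 * 60 * 60)  # assumed search budget of 2 hours
clf.final_fit(X_train, y_train, X_test, y_test, retrain=True)
print(clf.evaluate(X_test, y_test))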
EMNIST-MNIST


EMNIST-Digits


EMNIST-Letters


EMNIST-Balanced


EMNIST-ByClass and EMNIST-ByMerge
I ran out of memory for these two, since Auto-Keras does not support data generators.
TPOT on EMNIST
Let’s do the obligatory TPOT test as well, shown here on the EMNIST-MNIST subset:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from mnist import MNIST
from sklearn.metrics import accuracy_score
from tpot import TPOTClassifier
import gc
mndata = MNIST('./data')
mndata.select_emnist('mnist')
X_train, y_train = mndata.load_training()
X_test, y_test = mndata.load_testing()
X_train = np.asarray(X_train).reshape(-1,28*28)
X_test = np.asarray(X_test).reshape(-1,28*28)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
gc.collect()
tpot_EMNIST = TPOTClassifier(max_time_mins=120,
                             max_eval_time_mins=5,
                             verbosity=2,
                             n_jobs=-1)
tpot_EMNIST.fit(X_train, y_train)
print(tpot_EMNIST.score(X_test, y_test))
tpot_EMNIST.export('TPOT_EMNIST.py')
y_pred = tpot_EMNIST.predict(X_test)
acc_EMNIST = accuracy_score(y_true=y_test,
                            y_pred=y_pred)
print(acc_EMNIST)
pickle.dump(y_pred, open("./TPOT_y_pred_EMNIST-mnist.p","wb"))
The resulting test accuracy is 0.9217.


# final pipeline #
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
train_test_split(features, tpot_data['target'].values, random_state=None)
# Average CV score on the training set was:0.9200166666666666
exported_pipeline = LinearSVC(C=0.5, dual=False, loss="squared_hinge", penalty="l2", tol=0.01)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
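The export only contains placeholders for the data loading ('PATH/TO/DATA/FILE' stays as it is). A minimal sketch of re-fitting the exported LinearSVC pipeline directly on the EMNIST-MNIST arrays, reusing the python-mnist loading from the TPOT script above, could look like this:
# reuse_exported_pipeline.py #
import numpy as np
from mnist import MNIST
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

mndata = MNIST('./data')
mndata.select_emnist('mnist')
X_train, y_train = mndata.load_training()
X_test, y_test = mndata.load_testing()
X_train = np.asarray(X_train).reshape(-1, 28 * 28)
X_test = np.asarray(X_test).reshape(-1, 28 * 28)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# hyperparameters copied from the exported pipeline above
exported_pipeline = LinearSVC(C=0.5, dual=False, loss="squared_hinge", penalty="l2", tol=0.01)
exported_pipeline.fit(X_train, y_train)
print(accuracy_score(y_test, exported_pipeline.predict(X_test)))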
References
[1] Cohen et al. (2017): EMNIST: an extension of MNIST to handwritten letters. arXiv:1702.05373
[2] LeCun et al. (1998): Gradient-based learning applied to document recognition. Proceedings of the IEEE, 86(11), 2278-2324.
[3] Xiao et al. (2017): Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms. arXiv:1708.07747
[4] Clanuwat et al. (2018): Deep Learning for Classical Japanese Literature. NeurIPS 2018 Workshop on Machine Learning for Creativity and Design. arXiv:1812.01718