Introduction

EMNIST [1] is another MNIST-like dataset, joining the ranks of the original MNIST [2], Fashion-MNIST [3] and Kuzushiji-MNIST [4]. Like Kuzushiji-MNIST, EMNIST is not a single dataset but a family of six datasets covering various classes of handwritten letters and digits. Some of the class distributions are balanced, others are not.

If you like MNIST-like datasets, then have a look at CMNIST as well.
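All six subsets can be loaded with the python-mnist package, which is also used for the experiments below. A minimal sketch, assuming the extracted EMNIST binary files live in ./data:

from mnist import MNIST

# directory containing the extracted EMNIST binaries
mndata = MNIST("./data")
# pick one of: "byclass", "bymerge", "balanced", "letters", "digits", "mnist"
mndata.select_emnist("balanced")
X_train, y_train = mndata.load_training()
X_test, y_test = mndata.load_testing()
print(len(X_train), len(X_test))  # each sample is a flat list of 784 pixel values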

EMNIST-ByClass

EMNIST-ByClass consists of 62 classes containing 814255 samples.

[Figure: EMNIST-ByClass set distributions]

Let’s have a look at some examples:

[Figure: EMNIST-ByClass examples]

EMNIST-ByMerge

EMNIST-ByMerge consists of 47 classes containing 814255 samples.

[Figure: EMNIST-ByMerge set distributions]

Let’s have a look at some examples here:

[Figure: EMNIST-ByMerge examples]

EMNIST-Balanced

EMNIST-Balanced consists of 47 classes containing 131600 samples.

[Figure: EMNIST-Balanced set distributions]

Let’s have a look at some examples:

[Figure: EMNIST-Balanced examples]

EMNIST-Letters

EMNIST-Letters consists of 26 classes containing 145600 samples.

[Figure: EMNIST-Letters set distributions]

Let’s have a look at some examples of this dataset:

[Figure: EMNIST-Letters examples]

EMNIST-Digits

EMNIST-Digits consists of 10 classes containing 280000 samples.

[Figure: EMNIST-Digits set distributions]

Let’s have a look at some examples of this dataset:

[Figure: EMNIST-Digits examples]

EMNIST-MNIST

EMNIST-MNIST consists of 10 classes containing 70000 samples.

[Figure: EMNIST-MNIST set distributions]

Let’s have a look at some examples of this dataset:

[Figure: EMNIST-MNIST examples]

Quick models using fastai on EMNIST

Loading custom datasets into fastai is a real pain. The standard PyTorch data generators hardly work with it, and the built-in solutions are messy and barely documented. Therefore, we have to write a little function that dumps the EMNIST (sub-)datasets into an ImageNet-compatible folder structure. There is probably a workaround, but this was the fastest solution that came to my mind.

# dumpImages.py #
import numpy as np
import pandas as pd
from mnist import MNIST
import gc
import os
from PIL import Image
from sklearn.model_selection import train_test_split


def dumpToImages(X, y, SetType, DatasetName, Path):
    def saveImg(X, y, fullPath):
        imgDict = {}
        for i in range(len(X)):
            # python-mnist returns each image as a flat list of 784 pixel values
            imgTmp = np.asarray(X[i]).reshape(28, 28)
            # replicate the grayscale channel three times so pretrained RGB models can read the PNGs
            imgTmp = np.stack((imgTmp,) * 3, axis=-1).astype(np.uint8)
            img = Image.fromarray(imgTmp)
            filePathAndName = fullPath + str(y[i]) + "/" + str(i) + ".png"
            img.save(filePathAndName)
            imgDict[i] = {"file": filePathAndName,
                          "label": y[i]}
        print("Images dumped")
        return imgDict

    # build <Path>/<DatasetName>/<SetType>/<label>/ folders (ImageNet-style layout);
    # SetType is one of "train", "valid" or "test"
    fullPath = Path + DatasetName + "/" + SetType + "/"
    os.makedirs(fullPath, exist_ok=True)
    for label in np.unique(y):
        os.makedirs(fullPath + str(label) + "/", exist_ok=True)
    result = saveImg(X, y, fullPath)

    return pd.DataFrame.from_dict(result, orient="index")

def dump_subset(subset,subsetName):
    print("Dumping", subsetName)
    mndata = MNIST("./data/orig")
    mndata.select_emnist(subset)
    X_train, y_train = mndata.load_training()
    y_train = np.asarray(y_train)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2,shuffle=True, random_state=42)

    X_test, y_test = mndata.load_testing()
    y_test = np.asarray(y_test)

    DF_train = dumpToImages(X_train, y_train, "train", subsetName,"./data/")
    DF_train.to_csv("./data/"+subsetName+"_train.csv")
    del X_train
    del y_train
    gc.collect()

    DF_valid = dumpToImages(X_valid, y_valid, "valid", subsetName,"./data/")
    DF_valid.to_csv("./data/"+subsetName+"_valid.csv")
    del X_valid
    del y_valid
    gc.collect()

    DF_test = dumpToImages(X_test, y_test, "test", subsetName,"./data/")
    DF_test.to_csv("./data/"+subsetName+"_test.csv")
    del X_test
    del y_test
    gc.collect()
    print(subsetName, "dumped")

def main():
    subsets = ["mnist", "digits", "letters", "balanced", "bymerge", "byclass"]
    subsetNames = ["EMNIST-MNIST","EMNIST-Digits", "EMNIST-Letters", "EMNIST-Balanced", "EMNIST-Bymerge", "EMNIST-Byclass"]

    for i in range(len(subsets)):
        dump_subset(subsets[i], subsetNames[i])

if __name__ == "__main__":
    main()

Because I think manual, repetitive work is stupid, I wrote a little script to explore how some very basic models perform on EMNIST:

# train_test_fastai_models.py #
import warnings
warnings.filterwarnings('ignore')

import gc
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as skmetrics

import torch

import fastai
from fastai import *
from fastai.vision import *
from fastai.callbacks import *
from fastai.basic_train import *
from fastai.vision.learner import *


def training(subset):
    train_df = pd.read_csv("./data/"+subset+"_train.csv", usecols=["file","label"])
    valid_df = pd.read_csv("./data/"+subset+"_valid.csv", usecols=["file","label"])
    train_valid_df = train_df.append(valid_df, ignore_index=True)
    ds_tfms = get_transforms(do_flip=False, flip_vert=False, max_rotate=30, max_zoom=1.1, max_warp=0.3)
    # ImageDataBunch.from_folder seems to be dysfunctional 
    data = ImageDataBunch.from_df("", train_valid_df, ds_tfms=ds_tfms, size=28, bs=1024)
    print(data)
    learn = cnn_learner(data, models.resnet18, metrics=accuracy)
    learn.lr_find(start_lr=1e-6, end_lr=1e1, stop_div=True, num_it=200)

    plt.figure()
    learn.recorder.plot(suggestion=True)
    plt.title("Optimal Learning Rate - "+subset)
    plt.savefig("./graphics/"+subset+"_lr_selection.png")
    plt.savefig("./graphics/"+subset+"_lr_selection.pdf")
    plt.close()

    lr = learn.recorder.min_grad_lr
    print(lr)
    learn.fit_one_cycle(10, lr)


    plt.figure()
    learn.recorder.plot_metrics()
    plt.title("Training accuracies - "+subset)
    plt.savefig("./graphics/"+subset+"_train_accs.png")
    plt.savefig("./graphics/"+subset+"_train_accs.pdf")
    plt.close()

    plt.figure()
    learn.recorder.plot_lr()
    plt.title("Training Learning Rates - "+subset)
    plt.savefig("./graphics/"+subset+"_train_lr.png")
    plt.savefig("./graphics/"+subset+"_train_lr.pdf")
    plt.close()

    plt.figure()
    learn.recorder.plot_losses()
    plt.title("Training Losses - "+subset)
    plt.savefig("./graphics/"+subset+"_train_losses.png")
    plt.savefig("./graphics/"+subset+"_train_losses.pdf")
    plt.close()

    learn.export(subset+".pkl")


def testing(subset):
    def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.

        Original source: scikit-learn documentation
        """
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            title = title+" (normalized) \n"
        else:
            title = title+"\n"

        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes)
        plt.yticks(tick_marks, classes)

        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()

    test = pd.read_csv("./data/"+subset+"_test.csv", usecols=["file","label"])
    y_true = test["label"].values
    learn = load_learner("./",subset+".pkl", test=ImageList.from_df(test, path=""))
    y_pred,_ = learn.get_preds(ds_type=DatasetType.Test)
    y_pred = np.argmax(y_pred.numpy(), axis=1)
    acc = skmetrics.accuracy_score(y_true, y_pred)
    title = subset+" Acc:"+str(np.round(acc,4))+" confusion matrix"
    classes = sorted(np.unique(y_true))

    plt.figure(figsize=(18,8))
    plt.subplot(1,2,1)
    plot_confusion_matrix(skmetrics.confusion_matrix(y_true=y_true, y_pred=y_pred), classes, title=title)
    plt.subplot(1,2,2)
    plot_confusion_matrix(skmetrics.confusion_matrix(y_true=y_true, y_pred=y_pred), classes, title=title, normalize=True)
    plt.savefig("./graphics/"+subset+"_confusion_matrix.png")
    plt.savefig("./graphics/"+subset+"_confusion_matrix.pdf")
    plt.close()



def main():
    tick = time.time()
    print("fastai version:",fastai.__version__)
    print("PyTorch version:",torch.__version__)

    subsets = ["EMNIST-MNIST","EMNIST-Digits", "EMNIST-Letters", "EMNIST-Balanced", "EMNIST-Bymerge", "EMNIST-Byclass"]

    for subset in subsets:
        tick1 = time.time()
        print("Running fastai on", subset)
        training(subset)
        gc.collect()
        testing(subset)
        print("Subset finished after:",time.time()-tick,"seconds")
        gc.collect()
    print("Finished after:",(time.time()-tick)/3600,"hours")



if __name__ == "__main__":
    main()

It took about 6.27 hours in total. Here are the results:

EMNIST-MNIST

epoch     train_loss  valid_loss  accuracy  time    
0         1.758948    0.900828    0.705417  00:16                                                                                                     
1         0.859533    0.160136    0.950250  00:16                                                                                                     
2         0.444318    0.088437    0.973583  00:16                                                                                                     
3         0.259608    0.061491    0.982250  00:15                                                                                                     
4         0.173354    0.056510    0.982917  00:15                                                                                                     
5         0.129357    0.042662    0.987667  00:16                                                                                                     
6         0.107227    0.041561    0.987917  00:16                                                                                                     
7         0.090129    0.042532    0.987750  00:16                                                                                                     
8         0.081324    0.037910    0.989583  00:16                                                                                                     
9         0.073856    0.037616    0.989333  00:16                                                                                                      
Subset finished after: 229.53496646881104 seconds

EMNIST-Digits

epoch     train_loss  valid_loss  accuracy  time    
0         0.621408    0.179451    0.944687  01:00                                                                                                       
1         0.160846    0.063142    0.980604  01:00                                                                                                       
2         0.089626    0.038934    0.988312  00:59                                                                                                       
3         0.071559    0.044202    0.986687  01:00                                                                                                       
4         0.060354    0.031984    0.990562  01:00                                                                                                       
5         0.052091    0.027022    0.992083  01:01                                                                                                       
6         0.045011    0.022862    0.992854  01:01                                                                                                       
7         0.040925    0.020127    0.994083  01:00                                                                                                       
8         0.038547    0.018354    0.994708  01:00                                                                                                       
9         0.036072    0.018189    0.994729  01:00                                                                                                        
Subset finished after: 906.8308148384094 seconds 

EMNIST-Letters

epoch     train_loss  valid_loss  accuracy  time    
0         2.176001    0.966784    0.703926  03:01                                                                                                     
1         1.005924    0.447574    0.851603  00:33                                                                                                     
2         0.629116    0.352484    0.882332  00:32                                                                                                     
3         0.491678    0.316396    0.892748  00:33                                                                                                     
4         0.426249    0.285679    0.902724  00:33                                                                                                     
5         0.390442    0.245990    0.913301  00:32                                                                                                     
6         0.357532    0.247692    0.912059  00:32                                                                                                     
7         0.333830    0.215326    0.925280  00:31                                                                                                     
8         0.312843    0.207082    0.927724  00:32                                                                                                     
9         0.308315    0.203592    0.929647  00:32                                                                                                      
Subset finished after: 2042.9792199134827 seconds

EMNIST-Balanced

epoch     train_loss  valid_loss  accuracy  time    
0         2.880712    1.399821    0.583112  02:47                                                                                                     
1         1.456914    0.660040    0.775399  00:29                                                                                                     
2         0.950243    0.540631    0.811392  00:29                                                                                                     
3         0.754311    0.483476    0.831560  00:29                                                                                                     
4         0.672728    0.489963    0.827128  00:29                                                                                                     
5         0.620223    0.424026    0.841489  00:29                                                                                                     
6         0.578693    0.380079    0.864849  00:29                                                                                                     
7         0.545485    0.371544    0.864982  00:29                                                                                                     
8         0.517935    0.359465    0.868573  00:29                                                                                                     
9         0.503609    0.353420    0.870567  00:29                                                                                                      
Subset finished after: 3082.569526910782 seconds 

EMNIST-ByMerge

epoch     train_loss  valid_loss  accuracy  time    
0         0.757444    0.469373    0.838078  1:16:12                                                                                                     
1         0.539941    0.429258    0.851332  02:52                                                                                                       
2         0.475716    0.379418    0.862264  02:52                                                                                                       
3         0.452739    0.364511    0.868862  02:52                                                                                                       
4         0.425480    0.343968    0.872774  02:52                                                                                                       
5         0.412752    0.356715    0.869063  02:54                                                                                                       
6         0.392824    0.307990    0.887267  02:52                                                                                                       
7         0.382148    0.293193    0.890584  02:52                                                                                                       
8         0.371294    0.289000    0.891056  02:52                                                                                                       
9         0.371566    0.288419    0.891193  02:52                                                                                                        
Subset finished after: 11022.80905342102 seconds

EMNIST-ByClass

epoch     train_loss  valid_loss  accuracy  time    
0         0.899894    0.603188    0.791892  1:43:18                                                                                                       
1         0.649401    0.499177    0.824868  34:16                                                                                                       
2         0.595045    0.473051    0.834281  02:52                                                                                                       
3         0.564651    0.455126    0.839411  02:53                                                                                                       
4         0.537018    0.459642    0.838042  02:52                                                                                                       
5         0.515960    0.408394    0.851253  02:53                                                                                                       
6         0.500930    0.400614    0.852320  02:52                                                                                                       
7         0.482904    0.386061    0.857292  02:53                                                                                                       
8         0.472474    0.384079    0.857149  02:52                                                                                                       
9         0.465676    0.383195    0.857557  02:53                                                                                                        
Subset finished after: 22570.254970788956 seconds

With an increasing number of classes, class imbalance and sample size, the results get worse. However, we have to remind ourselves that this is a transfer-learning approach run for just 10 epochs as a quick-and-dirty evaluation, not a proper attempt to break any benchmarks ;). I would have used capsule networks for that anyhow ;).
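For a more serious fastai attempt, the next step would be to unfreeze the pretrained backbone and fine-tune the whole network with discriminative learning rates. A minimal sketch continuing from the learner trained in training() above (the learning-rate slice is an untuned assumption):

# continuing from the cnn_learner trained above
learn.unfreeze()  # train all layers, not only the classification head
# discriminative learning rates: small for early layers, larger for the head
learn.fit_one_cycle(10, max_lr=slice(1e-5, 1e-3))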

Auto-Keras on EMNIST

Let’s do the obligatory Auto-Keras test as well.
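A minimal sketch of what such a run looks like with the Auto-Keras 0.x ImageClassifier API of that era (the exact import path moved around between 0.x releases, and the time limit here is an arbitrary choice, not the budget actually used):

# autokeras_emnist.py # (sketch, Auto-Keras 0.x API)
import numpy as np
from mnist import MNIST
from autokeras import ImageClassifier  # some 0.x releases: autokeras.image.image_supervised

mndata = MNIST("./data/orig")
mndata.select_emnist("mnist")
X_train, y_train = mndata.load_training()
X_test, y_test = mndata.load_testing()
# Auto-Keras 0.x expects image tensors with an explicit channel axis
X_train = np.asarray(X_train).reshape(-1, 28, 28, 1)
X_test = np.asarray(X_test).reshape(-1, 28, 28, 1)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

clf = ImageClassifier(verbose=True)
clf.fit(X_train, y_train, time_limit=2 * 60 * 60)  # 2-hour search budget (assumption)
clf.final_fit(X_train, y_train, X_test, y_test, retrain=True)
print(clf.evaluate(X_test, y_test))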

EMNIST-MNIST

EMNIST-Digits

EMNIST-Letters

EMNIST-Balanced

EMNIST-ByClass and EMNIST-ByMerge

I ran out of memory for these two since Auto-Keras doesn’t support data generators.

TPOT on EMNIST

Let’s do the obligatory TPOT test as well.

import numpy as np
import pandas as pd
import pickle
from mnist import MNIST
from sklearn.metrics import accuracy_score
from tpot import TPOTClassifier
import gc


mndata = MNIST('./data')
mndata.select_emnist('mnist')
X_train, y_train = mndata.load_training()
X_test, y_test = mndata.load_testing()
# flatten each image into a 784-dimensional feature vector
X_train = np.asarray(X_train).reshape(-1, 28*28)
X_test = np.asarray(X_test).reshape(-1, 28*28)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
gc.collect()

# cap the whole search at 2 hours and each pipeline evaluation at 5 minutes
tpot_EMNIST = TPOTClassifier(max_time_mins=120,
                             max_eval_time_mins=5,
                             verbosity=2,
                             n_jobs=-1)
tpot_EMNIST.fit(X_train, y_train)
print(tpot_EMNIST.score(X_test, y_test))
# export the best pipeline found as a standalone script
tpot_EMNIST.export('TPOT_EMNIST.py')
y_pred = tpot_EMNIST.predict(X_test)
acc_EMNIST = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc_EMNIST)
pickle.dump(y_pred, open("./TPOT_y_pred_EMNIST-mnist.p", "wb"))
The run reached a test accuracy of 0.9217. The exported pipeline:
# final pipeline #
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.9200166666666666
exported_pipeline = LinearSVC(C=0.5, dual=False, loss="squared_hinge", penalty="l2", tol=0.01)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

References

[1] Cohen et al. (2017): EMNIST: an extension of MNIST to handwritten letters. arXiv:1702.05373

[2] LeCun et al. (1998): Gradient-based learning applied to document recognition. Proceedings of the IEEE, 86(11), 2278-2324.

[3] Xiao et al. (2017): Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms. arXiv:1708.07747

[4] Clanuwat et al. (2018): Deep Learning for Classical Japanese Literature. NeurIPS 2018 Workshop on Machine Learning for Creativity and Design. arXiv:1812.01718