# Fungi top 4 data

In [None]:
from pathlib import Path

In [None]:
p = Path("/xdisk/sohampal/sohampal/fungi/DF20M-4/") # replace this with the path to where you downloaded the data
list(p.iterdir())

In [None]:
from itertools import islice

def _ls(p, n=None):
    """List the contents of directory `p`, returning at most `n` entries.

    A small convenience wrapper for inspecting directories. With `n=None`
    (the default) every entry is returned. Entries come back in whatever
    order `p.iterdir()` yields them.
    """
    entries = p.iterdir()
    return list(islice(entries, n))

In [None]:
_ls(p / "train")

In [None]:
_ls(p / "test")

In [None]:
_ls(p / "test" / "Mycena galericulata", 4)

In [None]:
{d.name: len(_ls(d)) for d in _ls(p / "train")} # number of images for each fungi species

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline

def show_images(p, ncols=8):
    """Show up to `ncols` images from directory `p` side by side in one row.

    The figure title is the directory name (`p.name`). If the directory
    holds fewer than `ncols` entries, the remaining panels are left blank
    instead of raising `StopIteration`.

    Args:
        p: `Path` of a directory whose entries can be opened by `Image.open`.
        ncols: number of panels in the row.
    """
    # squeeze=False keeps `axes` a 2-D array even when ncols == 1, so it can
    # always be iterated.
    fig, axes = plt.subplots(1, ncols, figsize=(20, 2), squeeze=False)
    imgs = p.iterdir()
    for ax in axes.flat:
        ax.axes.get_xaxis().set_ticks([])
        ax.axes.get_yaxis().set_ticks([])
        img = next(imgs, None)
        if img is None:
            # Ran out of images before running out of panels.
            ax.axis("off")
            continue
        # Close the file handle as soon as the image has been drawn.
        with Image.open(img) as im:
            ax.imshow(im)
    fig.suptitle(p.name)

# Draw one preview row of images for each species directory under `train`.
for d in _ls(p / "train"):
    show_images(d)

# Create data for binary classification

Copy the data to a subdirectory called `binary` under `DF20M-4`. This is not strictly necessary if you plan to write your own dataloaders, but it makes it easier to use the fastai dataloaders.

In [None]:
import shutil

def copy_images(partition):
    """Copy the two binary-task species of `partition` into the `binary` tree.

    For "Boletus edulis" and "Amanita muscaria", every entry of
    `p / partition / <species>` is copied (with `shutil.copy2`) into
    `p / "binary" / partition / <species>`, creating the destination
    directory if it does not exist yet.
    """
    for fungi in ("Boletus edulis", "Amanita muscaria"):
        source_dir = p / partition / fungi
        target_dir = p / "binary" / partition / fungi
        target_dir.mkdir(parents=True, exist_ok=True)
        for img in source_dir.iterdir():
            shutil.copy2(img, target_dir)

copy_images("train")
copy_images("test")

In [None]:
{d.name: len(_ls(d)) for d in _ls(p / "binary" / "train")}

In [None]:
{d.name: len(_ls(d)) for d in _ls(p / "binary" / "test")}

# Binary classification with fastai

We will randomly split the data under the `train` subdirectory into a train set and a validation set. By default, fastai's `RandomSplitter` does an 80:20 split. After we have trained the model on the train set, we will test it with the images under the `test` subdirectory. The model has never seen these images, and thus this is a good way to test the generalization of the model. Sometimes, when we don't have a lot of data, we only split the data into a train set and a validation set.

In [None]:
from fastai.data.all import *
from fastai.vision.all import *

In [None]:
# Shown only to illustrate what fastai's `parent_label` function does.
def label_func(fname):
    """Return the name of the directory that directly contains `fname`."""
    containing_dir = fname.parent
    return containing_dir.name

In [None]:
# Recipe for the binary dataset: image inputs paired with categorical targets.
# NOTE(review): `RandomSplitter()` is given no seed, so the train/validation
# split presumably differs from run to run — pass a seed if reproducibility matters.
dblock = DataBlock(blocks = (ImageBlock, CategoryBlock),
                   get_items = get_image_files,  # gather the image files under the source path
                   get_y = parent_label,  # label = name of the parent directory (see `label_func`)
                   splitter = RandomSplitter(),  # random train/validation split, 80:20 by default
                   item_tfms = Resize(224))  # resize each item to size 224

In [None]:
# Datasets are essentially collections of inputs and corresponding targets
dsets = dblock.datasets(p / "binary" / "train")
dsets.train[0]  # first element of the training split

In [None]:
dsets.vocab

In [None]:
dls = dblock.dataloaders(p / "binary" / "train") # Dataloaders load the data in batches, default batch size is 64
dls.show_batch() # by default only 9 elements of a batch are shown

We will use a pretrained ResNet-34 model. The `vision_learner` function from fastai abstracts away the boilerplate code used to fine-tune or train a model.

In [None]:
learn = vision_learner(dls, resnet34, metrics=error_rate)

In [None]:
learn.lr_find() # this recommends a range of suitable learning rates, for more info check fastai docs

In [None]:
learn.fine_tune(3, 2e-3) # fine tune the Resnet-34 model for 3 epochs

In [None]:
learn.show_results()

In [None]:
# the interpretation object helps to get a better sense of the trained model
interp = ClassificationInterpretation.from_learner(learn) 
interp.plot_top_losses(9, figsize=(15, 11))

In [None]:
interp.plot_confusion_matrix()

In [None]:
interp.print_classification_report()

## Test accuracy

In [None]:
test_files = get_image_files(p / "binary" / "test") 
test_dl = dls.test_dl(test_files, with_labels=True) 

In [None]:
# Print the predicted class and the model's top score for every test image.
preds = learn.get_preds(dl=test_dl)
for index, item in enumerate(preds[0]):
    confidence = item.max()  # highest score in this row
    prediction = dls.categorize.decode(item.argmax())  # class index -> class name
    percent = float(confidence)
    print(f"Prediction: {prediction:18} - Confidence: {percent:.2%} - Image: {test_dl.items[index].name}")

In [None]:
interp = ClassificationInterpretation.from_learner(learn, dl=test_dl)
interp.plot_confusion_matrix()

In [None]:
interp.print_classification_report()

# Multi-class classification with fastai

The code for the multi-class classification can be exactly the same as for the binary classification. However, we will replace the `RandomSplitter` with the `GrandparentSplitter` from fastai just to demonstrate it. We will not split the images under `train` into a train set and a validation set. We will use all the images under `train` as the train set, and all the images under `test` as the validation set. There will be no separate test set.

This is not us recommending one split (train/validation) over another (train/validation/test), but just demonstrating how we can do either.

In [None]:
from functools import partial

# Only look inside the top-level `train` and `test` folders of `p`.
# A plain `get_image_files(p)` recurses through everything under `p`. Once the
# `binary` copy made earlier in this notebook exists, its `binary/train/...`
# and `binary/test/...` images also have grandparent folders named `train` /
# `test`, so `GrandparentSplitter` would accept them too and the two binary
# species would be counted twice in both splits.
dblock = DataBlock(blocks = (ImageBlock, CategoryBlock),
                   get_items = partial(get_image_files, folders=["train", "test"]),
                   get_y = parent_label,  # label = name of the species directory
                   splitter = GrandparentSplitter(train_name="train", valid_name="test"),
                   item_tfms = Resize(224))

dsets = dblock.datasets(p)
dsets.vocab

In [None]:
dls = dblock.dataloaders(p)
dls.show_batch()

In [None]:
learn = vision_learner(dls, resnet34, metrics=error_rate)
learn.lr_find()

In [None]:
learn.fine_tune(5, 2e-3)

In [None]:
learn.show_results()

In [None]:
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
interp.plot_top_losses(9, figsize=(16, 11))

In [None]:
# NOTE(review): the file name says "resnet32", but the model trained above is
# `resnet34` — confirm whether the name is intentional before relying on it.
learn.export("resnet32-4.pkl") # Save the model