Image Processing x Machine Learning: Classifying Leaves
Visualizing an example
import numpy as np
import matplotlib.pyplot as plt
import skimage.io as skio
from skimage import img_as_ubyte, img_as_float
from skimage.io import imread, imshow

plt.figure(dpi=200)
plant = imread('plantA_1.jpg')
imshow(plant);

Let’s try preparing this image for segmentation. We also crop the borders of the image, as there seem to be extra pixels left over from the original picture:
from skimage.morphology import erosion, dilation, opening, closing
from skimage.measure import label, regionprops
from skimage.color import label2rgb

def multi_dil(im, num):
    # apply dilation num times
    for i in range(num):
        im = dilation(im)
    return im

def multi_ero(im, num):
    # apply erosion num times
    for i in range(num):
        im = erosion(im)
    return im

plt.figure(dpi=200)
# threshold the image, crop a 5-pixel border, and keep the first channel
leaves = 1.0 * ((plant/255) < 0.4)[5:-5, 5:-5, 0]
imshow(leaves);

Let’s now try to segment this image:
plt.figure(dpi=200)
label_im = label(leaves)
imshow(label_im);

Reading the Images
import glob
import pandas as pd
import re

plants = glob.glob('plant*')
df = pd.DataFrame(plants, columns = ['filename'])
df.head()

Let’s get the labels from the filenames:
def get_label(x):
    # extract the plant letter from the filename, e.g. 'plantA_1.jpg' -> 'A'
    try:
        result = re.search(r'(?<=plant).+(?=_)', str(x))[0]
    except TypeError:
        # fall back to filenames without an index, e.g. 'plantB.jpg'
        result = re.search(r'(?<=plant).+(?=\.)', str(x))[0]
    return result

df['label'] = df['filename'].apply(get_label)
df

Cleaning the Images
We define the following thresholds per label obtained through trial and error:
thresh = {'A':0.4, 'B':0.4, 'C':0.4, 'D':0.7, 'E':0.7}
We then define our cleaning function, which thresholds each image and denoises it with erosion and dilation:
def clean_image(img, t=0.5):
    plant = imread(img)
    # threshold at t, then denoise with three erosions followed by three dilations
    leaves = 1.0 * ((plant/255) < t)
    im_cleaned = multi_dil(multi_ero(leaves, 3), 3)[5:-5, 5:-5, 0]
    return im_cleaned
Then we apply this to all the files and store the resulting image for further processing:
df['cleaned'] = df.apply(lambda x: clean_image(x.filename, thresh[x.label]), axis=1)
We then segment each image to their individual leaves:
def get_regions(im_cleaned):
    label_im = label(im_cleaned)
    return regionprops(label_im)

df['regions'] = df.apply(lambda x: get_regions(x.cleaned), axis=1)

# collect every detected leaf region together with its plant label and filename
data = {}
index = 0
for x in df.index:
    leaf_label = df['label'].iloc[x]
    filename = df['filename'].iloc[x]
    for i in df['regions'].iloc[x]:
        data[index] = {'leaf': i,
                       'label': leaf_label,
                       'filename': filename}
        index += 1

data_df = pd.DataFrame(data).T
data_df

From here we can see that we’ve ended up with 260 leaves.
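A quick sanity check (a hypothetical snippet, not from the original notebook) is to count the rows of data_df directly:
len(data_df)  # 260 in the original run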
Extracting Features
Let us now extract the following features from each individual leaf:
- Length
- Width
- Perimeter
- Area
- Centroid Location
Let us first define the functions to get these features:
def get_size_coordinates(props):
    x1, y1, x2, y2 = props.bbox
    return x1, y1, x2, y2

def get_length(coord):
    return coord[2] - coord[0]

def get_width(coord):
    return coord[3] - coord[1]

def get_peri(leaf):
    return leaf.perimeter

def get_area(leaf):
    return leaf.area

def get_centroid(leaf):
    return leaf.local_centroid

def get_cx(length, centroid):
    return length - centroid[0]

def get_cy(width, centroid):
    return width - centroid[1]
Then we apply the above functions to each leaf:
data_df['coordinates'] = data_df.apply(lambda x: get_size_coordinates(x.leaf), axis=1)
data_df['length'] = data_df.apply(lambda x: get_length(x.coordinates), axis=1)
data_df['width'] = data_df.apply(lambda x: get_width(x.coordinates), axis=1)
data_df['perimeter'] = data_df.apply(lambda x: get_peri(x.leaf), axis=1)
data_df['area'] = data_df.apply(lambda x: get_area(x.leaf), axis=1)
data_df['centroid'] = data_df.apply(lambda x: get_centroid(x.leaf), axis=1)
data_df['cx'] = data_df.apply(lambda x: get_cx(x.length, x.centroid), axis=1)
data_df['cy'] = data_df.apply(lambda x: get_cy(x.width, x.centroid), axis=1)

data_df[['length', 'width', 'area', 'perimeter', 'cx', 'cy', 'label']]

Let’s visualize these features through a pairplot:
import seaborn as sns
sns.set(style="ticks")df = data_df[['length', 'width', 'area', 'perimeter', 'cx', 'cy', 'label']]
sns.pairplot(df, hue="label", diag_kind='kde')

We also check how many leaves each plant label contributes:
data_df.groupby('label').size()

Machine Learning Models
We then pass the resulting dataframe through several machine learning models, namely:
- KNN
- Logistic Regression (L1 & L2 Regularizations)
- SVC (L1 & L2 Regularizations)
But first, let us compute the proportion chance criterion (PCC), the sum of the squared class proportions, which estimates the accuracy a classifier would achieve by chance:
sizes = data_df.groupby('label').size()
sizes_sum = sizes.sum()

((sizes/sizes_sum)**2).sum()*100
We end up with a PCC of about 20.07%, so any useful model should do substantially better than this chance baseline.
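Below is a minimal sketch of how these models could be trained and scored with scikit-learn, assuming a stratified hold-out split, standardized features, and default hyperparameters; the exact pipeline and settings behind the reported accuracies are not shown in this article:
# Sketch only: the split, scaling, and hyperparameters here are assumptions,
# not the original pipeline.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X = data_df[['length', 'width', 'area', 'perimeter', 'cx', 'cy']].astype(float)
y = data_df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42)

# scale features so distance- and penalty-based models see comparable ranges
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Logistic Regression (L1)': LogisticRegression(penalty='l1', solver='liblinear'),
    'Logistic Regression (L2)': LogisticRegression(penalty='l2'),
    'SVC (L1)': LinearSVC(penalty='l1', dual=False),
    'SVC (L2)': LinearSVC(penalty='l2'),
}
for name, model in models.items():
    acc = model.fit(X_train, y_train).score(X_test, y_test)
    print(f'{name}: {acc:.3f}')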
Running our input through the aforementioned machine learning models, we obtain the following accuracies:

We then conclude that logistic regression with L1 regularization is the best model for our dataset, with length as the top predictor across the board.
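One way to identify the top predictor is to inspect the magnitudes of the fitted coefficients of the L1-regularized logistic regression; since the features were standardized in the sketch above, the coefficient sizes are roughly comparable. This snippet assumes the models dictionary from that sketch, not the article's original code:
import numpy as np

logreg = models['Logistic Regression (L1)']  # fitted in the sketch above
feature_names = ['length', 'width', 'area', 'perimeter', 'cx', 'cy']
# mean absolute coefficient per feature across the one-vs-rest classifiers
importance = np.abs(logreg.coef_).mean(axis=0)
for name, score in sorted(zip(feature_names, importance), key=lambda p: -p[1]):
    print(f'{name}: {score:.3f}')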