Face masks play a central role in protecting the health of the community against COVID-19. In this project, we study the Face Mask Detection Dataset, which contains 853 images of people in a variety of everyday situations. Each image comes with annotations divided into three classes (with mask, without mask, mask worn incorrectly), together with the corresponding bounding boxes in the PASCAL VOC format.

In particular, we will train a neural network to detect faces in images and classify whether each person is wearing a face mask correctly. The objective is to apply deep learning techniques to detect whether people violate mask-wearing norms in public. We do not train our neural network from scratch; instead, we employ a pretrained ResNet model and fine-tune it for our purpose.

Let us first import the needed packages:

import os
import numpy as np
import pandas as pd

import torch
import torchvision
from torchvision import transforms, datasets, models
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

import matplotlib.pyplot as plt
import matplotlib.patches as patches

from bs4 import BeautifulSoup
from PIL import Image
from collections import Counter

Prepare Data

Next, we need some helper functions to generate training data from the extracted dataset. We use the BeautifulSoup package to extract information from the .xml annotation files. Each identified object is assigned an integer label ("without_mask" = 1, "with_mask" = 2, "mask_weared_incorrect" = 3), as required by PyTorch. The corresponding bounding boxes are also converted to the PyTorch format [xmin, ymin, xmax, ymax].
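For reference, a shortened, illustrative annotation file in the PASCAL VOC format looks roughly like this (the file name and coordinates below are made up):

&lt;annotation&gt;
    &lt;filename&gt;maksssksksss0.png&lt;/filename&gt;
    &lt;object&gt;
        &lt;name&gt;with_mask&lt;/name&gt;
        &lt;bndbox&gt;
            &lt;xmin&gt;79&lt;/xmin&gt;
            &lt;ymin&gt;105&lt;/ymin&gt;
            &lt;xmax&gt;109&lt;/xmax&gt;
            &lt;ymax&gt;142&lt;/ymax&gt;
        &lt;/bndbox&gt;
    &lt;/object&gt;
    &lt;!-- one &lt;object&gt; element per annotated face --&gt;
&lt;/annotation&gt;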

def generate_box(obj):
    # get bounding box coordinates in pytorch format for a given object
    xmin = int(obj.find('xmin').text)
    ymin = int(obj.find('ymin').text)
    xmax = int(obj.find('xmax').text)
    ymax = int(obj.find('ymax').text) 
    return [xmin, ymin, xmax, ymax]

def generate_label(obj):
    # assign label to object. Note that the label starts from 1, 
    # since label 0 is reserved for the background 
    if obj.find('name').text == "with_mask":
        return 2
    elif obj.find('name').text == "mask_weared_incorrect":
        return 3
    return 1

def generate_target(image_id, file): 
    # generate training target from the annotation file
    with open(file) as f:
        data = f.read()
        # read xml file
        soup = BeautifulSoup(data, 'xml')
        objects = soup.find_all('object')
        num_objs = len(objects)

        boxes = []
        labels = []
        for i in objects: # extract annotation of each object
            boxes.append(generate_box(i))
            labels.append(generate_label(i))

        # convert all to pytorch tensors
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        img_id = torch.tensor([image_id])
        # save annotation in dictionary for each image
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = img_id
        
        return target
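As a quick sanity check, we can call generate_target on a single annotation file; the file name below assumes the dataset's maksssksksss&lt;id&gt;.xml naming scheme:

target = generate_target(0, "data/annotations/maksssksksss0.xml")
print(target["boxes"].shape)  # torch.Size([num_objs, 4])
print(target["labels"])       # e.g. tensor([2, 1, 2])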

Now we define our custom dataset. The dataset class should inherit from torch.utils.data.Dataset and implement the methods __getitem__ and __len__:

class MaskDataset(torch.utils.data.Dataset):
    def __init__(self, transforms):
        self.transforms = transforms
        # load all image and annotation files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir("data/images/")))
        self.masks = list(sorted(os.listdir("data/annotations/")))

    def __getitem__(self, idx):
        # load the image and its annotation file
        img_path = os.path.join("data/images/", self.imgs[idx])
        label_path = os.path.join("data/annotations/", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # generate the training target
        target = generate_target(idx, label_path)

        if self.transforms is not None:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        return len(self.imgs)
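As an optional sanity check that the dataset class works as intended (this assumes the images and annotations have been extracted to data/):

dataset_check = MaskDataset(transforms.ToTensor())
img, target = dataset_check[0]
print(img.shape)         # torch.Size([3, H, W])
print(target["labels"])  # labels of all faces in the first image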

We are curious how the labels are distributed. To find out, we read all the labels available in the dataset and use Counter to count the occurrences of each class:

all_labels = []
ann_paths = list(sorted(os.listdir("data/annotations/")))
for ann_path in ann_paths:
    with open(os.path.join("data/annotations/", ann_path)) as f:
        data = f.read()
        # read xml file
        soup = BeautifulSoup(data, 'xml')
        objects = soup.find_all('object')
        for obj in objects:
            all_labels.append(generate_label(obj))

class_names = ["without_mask", "with_mask", "mask_weared_incorrect"]
counts = Counter(all_labels)
# order the counts by label (1, 2, 3) so they line up with class_names
values = [counts[label] for label in (1, 2, 3)]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
background_color = '#faf9f4'
ax1.set_facecolor(background_color)
ax2.set_facecolor(background_color)
ax1.pie(values, wedgeprops=dict(width=0.3, edgecolor='w'),
        labels=class_names, radius=1, startangle=180, autopct='%1.2f%%')
ax2.bar(class_names, values, color='maroon', width=0.4)

plt.show()
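Since the pie chart only shows percentages, it can also help to print the absolute counts per class:

for label, name in enumerate(class_names, start=1):
    print(f"{name}: {counts[label]}")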

We see that the vast majority of annotations belong to the "with_mask" class.

Next, we define another helper function to display an image together with its annotations, with different colors indicating different labels:

colours = ['r', 'g', 'b']

def plot_image(img_tensor, annotation, display_ann=True):
    fig, ax = plt.subplots(1)
    img = img_tensor.cpu().data

    # display the image
    ax.imshow(img.permute(1, 2, 0))
    n_boxes = len(annotation["boxes"])
    for i in range(n_boxes):
        box = annotation["boxes"][i]
        col = annotation["labels"][i]
        xmin, ymin, xmax, ymax = box

        # create a rectangle patch for the bounding box
        rect = patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                 linewidth=1, edgecolor=colours[col - 1],
                                 facecolor='none')

        # add the patch to the axes
        ax.add_patch(rect)
        if display_ann:
            ax.annotate(class_names[col - 1], (xmin, ymin),
                        color=colours[col - 1], weight='bold',
                        fontsize=10, ha='left', va='baseline')

    plt.show()

Let’s split the data into a training set and a test set with random_split. As we have 853 data points, we will use 700 of them for training and 153 for testing. We also define the data transforms and our data loaders:

data_transform = transforms.Compose([transforms.ToTensor()])

def collate_fn(batch):
    # images have different sizes and numbers of boxes, so we collate
    # a batch into tuples instead of stacking it into a single tensor
    return tuple(zip(*batch))

# define the device before moving any tensors to it
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

batch_size = 4
dataset = MaskDataset(data_transform)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [700, 153])
train_dl = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_dl = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

for imgs, annotations in test_dl:  # take one batch for visualizing
    imgs = list(img.to(device) for img in imgs)
    annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
    break
plot_image(imgs[0], annotations[0])

Define Model

We will employ Faster R-CNN as our detector. Faster R-CNN is a model that predicts both bounding boxes and class scores for potential objects in an image. We will start from a model pre-trained on COCO train2017 and replace the last layer with our custom classifier.

def get_object_detection_model(num_classes):
    # load a Faster R-CNN object detection model pre-trained on COCO
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    # get the number of input features for the classifier,
    # i.e. the number of output features of the second-to-last layer
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one;
    # note that we need an extra class for the background, hence +1
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes + 1)

    return model

Now let's instantiate the model, the optimizer, and the learning rate scheduler:

num_classes = 3
model = get_object_detection_model(num_classes).to(device)

# optimizer: SGD over all trainable parameters
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# learning rate scheduler: decay the learning rate by 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

And train the model for several epochs:
num_epochs = 10
len_dataloader = len(train_dl)

for epoch in range(num_epochs):
    model.train()
    i = 0    
    epoch_loss = 0
    for imgs, annotations in train_dl:
        i += 1
        imgs = list(img.to(device) for img in imgs)
        annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
        # in train mode, the model returns a dict of partial losses;
        # we sum them up to obtain the total loss
        loss_dict = model(imgs, annotations)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step() 
        
        print(f'Batch: {i}/{len_dataloader}, Loss: {losses.item()}')
        epoch_loss += losses.item()
    lr_scheduler.step()
    print(f'>>>>>>> Done epoch {epoch}, loss {epoch_loss/i}' )
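Note that torchvision's detection models return the loss dictionary only in train mode; in eval mode they return predictions instead. To monitor overfitting, we can therefore measure the loss on the test set with the model kept in train mode but gradients disabled. A minimal sketch:

def evaluate_loss(model, data_loader, device):
    # Faster R-CNN only returns losses in train mode,
    # so keep train mode but disable gradient tracking
    model.train()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for imgs, annotations in data_loader:
            imgs = list(img.to(device) for img in imgs)
            annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
            loss_dict = model(imgs, annotations)
            total_loss += sum(loss for loss in loss_dict.values()).item()
            n_batches += 1
    return total_loss / n_batches

print(f'Test loss: {evaluate_loss(model, test_dl, device):.4f}')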

In some cases, the model and the cached tensors can take up a lot of GPU memory. It is possible to delete unused variables and empty the PyTorch CUDA cache:

# avoid CUDA out of memory: delete unused variables first,
# then let Python and PyTorch release the freed memory
import gc

del losses, imgs, annotations
gc.collect()
torch.cuda.empty_cache()

We can test the trained model on some unseen images:


batch_size = 8
test_dl = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
for imgs, annotations in test_dl:  # take one batch for testing
    imgs = list(img.to(device) for img in imgs)
    annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
    break
# in eval mode, the model returns predictions instead of losses
model.eval()

with torch.no_grad():
    preds = model(imgs[:batch_size])

for i in range(batch_size):
    print("Prediction")
    plot_image(imgs[i], preds[i], False)
    print("Target")
    plot_image(imgs[i], annotations[i], False)
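The raw output contains every detection the model keeps, including low-confidence ones. For cleaner visualizations, we can filter the predictions by their confidence score before plotting; the 0.5 threshold below is an arbitrary choice:

def filter_predictions(pred, threshold=0.5):
    # keep only detections whose confidence exceeds the threshold
    keep = pred["scores"] > threshold
    return {"boxes": pred["boxes"][keep], "labels": pred["labels"][keep]}

plot_image(imgs[0], filter_predictions(preds[0]), False)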

We see that the model's performance on classes 1 and 3 ("without_mask" and "mask_weared_incorrect") is worse than on class 2 ("with_mask"). This is likely because these two classes are underrepresented in the training data.

Save and Load Model

Saving a model for later use and loading it back are as simple as follows:

# save model
torch.save(model.state_dict(), 'model.pt')

# load model
model2 = get_object_detection_model(3)
model2.load_state_dict(torch.load('model.pt', map_location=device))
model2 = model2.to(device)
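To verify the round trip, we can run the reloaded model on the test batch from above and check that it reproduces the original predictions:

model2.eval()
with torch.no_grad():
    preds2 = model2(imgs[:batch_size])
# the reloaded model should produce the same boxes as the original
print(torch.allclose(preds[0]["boxes"], preds2[0]["boxes"]))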