#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# File   : segmentation_models.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 08/05/2023
#
# This file is part of Project Concepts.
# Distributed under terms of the MIT license.

__all__ = [
    'ImageBasedPCDSegmentationModel', 'PointGuidedImageSegmentationModel',
    'InstanceSegmentationResult', 'visualize_instance_segmentation'
]

import random
from typing import Tuple, List
from dataclasses import dataclass

import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt


@dataclass
class InstanceSegmentationResult(object):
    masks: np.ndarray
    """A numpy array of shape (N, H, W) and dtype bool."""

    pred_cls: List[str]
    """A list of length N."""

    pred_boxes: List[Tuple[Tuple[int, int], Tuple[int, int]]]
    """A list of length N. Each element is a tuple of ((x1, y1), (x2, y2))."""

    @property
    def nr_objects(self) -> int:
        return len(self.pred_cls)

class ImageBasedPCDSegmentationModel(object):
    """Instance segmentation based on a pretrained torchvision Mask R-CNN model."""

    def __init__(self, model_name: str = 'maskrcnn_resnet50_fpn_v2', device: str = 'cpu', score_threshold: float = 0.5):
        from torchvision.models.detection import maskrcnn_resnet50_fpn, MaskRCNN_ResNet50_FPN_Weights
        from torchvision.models.detection import maskrcnn_resnet50_fpn_v2, MaskRCNN_ResNet50_FPN_V2_Weights

        self.model_name = model_name
        self.device = torch.device(device)

        if self.model_name == 'maskrcnn_resnet50_fpn':
            model_fn = maskrcnn_resnet50_fpn
            weights = MaskRCNN_ResNet50_FPN_Weights.COCO_V1
        elif self.model_name == 'maskrcnn_resnet50_fpn_v2':
            model_fn = maskrcnn_resnet50_fpn_v2
            weights = MaskRCNN_ResNet50_FPN_V2_Weights.COCO_V1
        else:
            raise ValueError('Unknown model name: {}.'.format(model_name))

        self.model = model_fn(weights=weights)
        self.model.eval()
        self.model.to(self.device)
        self.preprocess = weights.transforms()
        self.score_threshold = score_threshold

    def segment_image(self, image: np.ndarray) -> InstanceSegmentationResult:
        """Segment an image into object instances.

        Args:
            image: a numpy array of shape (H, W, 3) and dtype uint8.

        Returns:
            an :class:`InstanceSegmentationResult` containing the boolean masks, class
            names, and bounding boxes of all detections scoring above ``score_threshold``.
        """
        image = torch.tensor(image.transpose(2, 0, 1) / 255.0, dtype=torch.float32)
        image = self.preprocess(image).to(self.device)
        pred = self.model([image])[0]

        # torchvision returns detections sorted by descending score, so keeping a
        # prefix of the predictions keeps exactly those above the threshold. If no
        # detection passes, the result is empty (the previous last-index lookup
        # raised an IndexError in that case).
        pred_score = pred['scores'].detach().cpu().numpy()
        nr_keep = int((pred_score > self.score_threshold).sum())

        # pred['masks'] has shape (N, 1, H, W); squeeze only the channel axis so
        # that a single detection does not collapse to (H, W).
        masks = (pred['masks'] > 0.5).squeeze(1).detach().cpu().numpy()
        pred_class = [COCO_INSTANCE_CATEGORY_NAMES[i] for i in pred['labels'].detach().cpu().numpy()]
        pred_boxes = [
            ((int(b[0]), int(b[1])), (int(b[2]), int(b[3])))
            for b in pred['boxes'].detach().cpu().numpy().astype(np.int64)
        ]

        masks = masks[:nr_keep]
        pred_boxes = pred_boxes[:nr_keep]
        pred_class = pred_class[:nr_keep]
        return InstanceSegmentationResult(masks, pred_class, pred_boxes)
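
# Usage sketch for the Mask R-CNN pipeline: a minimal example assuming an RGB
# uint8 image loaded via OpenCV (the file name 'example.jpg' is hypothetical).
#
#   import cv2
#   bgr = cv2.imread('example.jpg')
#   rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
#   model = ImageBasedPCDSegmentationModel(device='cpu')
#   result = model.segment_image(rgb)
#   print(result.nr_objects, result.pred_cls)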

class PointGuidedImageSegmentationModel(object):
    """Point-prompted segmentation based on Segment Anything (SAM)."""

    def __init__(self, checkpoint_path, model: str = 'sam_default', device: str = 'cpu'):
        from segment_anything import SamPredictor, sam_model_registry

        if model == 'sam_default':
            model = sam_model_registry['default']
        else:
            raise ValueError('Unknown model name: {}.'.format(model))

        self.model = model(checkpoint=checkpoint_path)
        self.device = torch.device(device)
        self.model.to(self.device)
        self.predictor = SamPredictor(self.model)
        self.last_image_id = None

    def segment_from_point(self, image, point):
        """Segment the object containing ``point``, given as (x, y) pixel coordinates."""
        # Computing the image embedding is expensive, so only re-run set_image
        # when the image object changes.
        if self.last_image_id is None or self.last_image_id != id(image):
            self.predictor.set_image(image)
            self.last_image_id = id(image)

        masks, _, _ = self.predictor.predict(point_coords=np.array([point]), point_labels=np.array([1]))
        # SAM returns multiple candidate masks for a single point prompt; prefer
        # the second-to-last candidate when more than one is available.
        if len(masks) > 1:
            mask = masks[-2].astype(np.uint8)
        else:
            mask = masks[-1].astype(np.uint8)
        mask = remove_remains(mask, point)
        return mask
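
# Usage sketch for the point-guided model: a hedged example assuming a local
# SAM ViT-H checkpoint (the file name below is the released default, but any
# path to a compatible checkpoint works) and an RGB uint8 image as above.
#
#   model = PointGuidedImageSegmentationModel('sam_vit_h_4b8939.pth', device='cuda')
#   mask = model.segment_from_point(rgb, (320, 240))  # query point in (x, y) pixels
#   print(mask.shape, mask.sum())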

# Category names for torchvision's COCO-trained detection models. The 'N/A'
# placeholders keep the list aligned with the (non-contiguous) COCO label ids.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A',
    'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
    'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork',
    'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A', 'toilet', 'N/A',
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
    'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock', 'vase',
    'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

def remove_remains(img, interest_point):
    """Remove connected components of a binary mask that do not contain ``interest_point``."""
    img = img.copy().astype(np.uint8)
    h, w = img.shape[:2]
    mask = np.zeros((h + 2, w + 2), np.uint8)

    # Flood-fill the component containing the seed with 0; subtracting the
    # result from the original erases every component *except* that one.
    img_inv = img.copy()
    cv2.floodFill(img_inv, mask, (int(interest_point[0]), int(interest_point[1])), 0)
    img -= img_inv
    return img
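
# A small self-contained illustration of remove_remains: two disconnected
# blobs, where only the component containing the query point survives.
#
#   toy = np.zeros((8, 8), dtype=np.uint8)
#   toy[1:3, 1:3] = 1   # blob A
#   toy[5:7, 5:7] = 1   # blob B
#   cleaned = remove_remains(toy, (1, 1))  # (x, y) seed inside blob A
#   assert cleaned[1, 1] == 1 and cleaned[5, 5] == 0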

def random_colored_mask(image):
    """Color a binary (H, W) mask with a randomly chosen RGB color."""
    colours = [
        [0, 255, 0], [0, 0, 255], [255, 0, 0], [0, 255, 255], [255, 255, 0],
        [255, 0, 255], [80, 70, 180], [250, 80, 190], [245, 145, 50],
        [70, 150, 250], [50, 190, 190]
    ]
    rgb = np.array(random.choice(colours))
    rgb = np.reshape(rgb, (1, 1, 3))
    image = image[:, :, None]
    image = np.where(image != 0, image * rgb, image)
    return image.astype(np.uint8)

def visualize_instance_segmentation(image: np.ndarray, result: InstanceSegmentationResult, rect_th: int = 3, text_size: int = 1, text_th: int = 3):
    """Overlay the predicted masks, boxes, and class labels on the image and show it with matplotlib."""
    image = image.copy()
    for i in range(len(result.masks)):
        # Skip table-like classes (e.g. 'dining table'), which typically cover
        # the background surface rather than an object of interest.
        if 'table' in result.pred_cls[i]:
            continue
        rgb_mask = random_colored_mask(result.masks[i])
        image = cv2.addWeighted(image, 1, rgb_mask, 0.5, 0)
        cv2.rectangle(image, result.pred_boxes[i][0], result.pred_boxes[i][1], color=(0, 255, 0), thickness=rect_th)
        cv2.putText(
            image, result.pred_cls[i],
            (result.pred_boxes[i][0][0], result.pred_boxes[i][0][1] + 10),
            cv2.FONT_HERSHEY_SIMPLEX, text_size, (0, 255, 0), thickness=text_th
        )

    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.show()
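
# End-to-end sketch tying the pieces together, assuming an RGB uint8 image
# loaded as in the earlier examples:
#
#   model = ImageBasedPCDSegmentationModel(device='cpu')
#   result = model.segment_image(rgb)
#   visualize_instance_segmentation(rgb, result)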