Commit 32f180f

* Update Cityscapes
  - 19 classes check
  - Transform method change
* Add ignore index on Cross Entropy
* Learning rate 10x for ASPP
* SGD optimizer
* Faster metric calculation
1 parent 83ded0a commit 32f180f

File tree

12 files changed: +229 -59 lines

.gitignore

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@
 cityscapes
 cityscapes/*
 
-voc
-voc/*
+pascalvoc
+pascalvoc/*
 
 outs
 outs/*

configs/default.yaml

Lines changed: 11 additions & 5 deletions
@@ -5,22 +5,28 @@ data:
   mode: "fine"
   target_type: "semantic"
   loader:
-    batch_size: 2
+    batch_size: 6
+    num_workers: 8
 
 net:
   deeplab:
     pretrained: False
-    resnet: "res53"
+    resnet: "res101"
     head_in_ch: 2048
-    num_classes: 34
+    num_classes: 19
   pointhead:
-    in_c: 546 # 512 + num_classes
-    num_classes: 34
+    in_c: 531 # 512 + num_classes
+    num_classes: 19
     k: 3
     beta: 0.75
 
 run:
   epochs: 200
 
+train:
+  lr: 0.01
+  momentum: 0.9
+  weight_decay: 0.0001
+
 apex:
   opt: "O0"
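Worth spelling out the in_c comment: PointHead consumes the 512-channel fine-grained features concatenated with the coarse prediction's class logits, so in_c = 512 + 19 = 531 (previously 512 + 34 = 546).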

configs/parser.py

Lines changed: 3 additions & 3 deletions
@@ -56,10 +56,10 @@ def __init__(self, path, args=None):
             raise NotImplementedError("Don't use args")
         assert isinstance(args, argparse.Namespace), "Check args"
 
-        path = f"{os.getcwd()}/{path}"
-        default_path = f"{os.getcwd()}/configs/default.yaml"
+        full_path = f"{os.getcwd()}/{path}"
+        default_path = full_path.replace(path, "configs/default.yaml")
         self.init_yaml(default_path)
-        self.update_yaml(path)
+        self.update_yaml(full_path)
 
     def init_yaml(self, path):
         with open(path, 'r') as f:
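Note that deriving default_path via full_path.replace(path, "configs/default.yaml") assumes the user-supplied path occurs exactly once in the absolute path; the effect is that the default config is resolved relative to the same working directory as the run config.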

datas/__init__.py

Lines changed: 9 additions & 6 deletions
@@ -2,24 +2,25 @@
 from torchvision.datasets.voc import VOCSegmentation
 from torchvision.datasets.cityscapes import Cityscapes
 
-from .transforms import Compose, Resize, ToTensor, Normalize, RandomCrop, RandomFlip
+from .transforms import Compose, Resize, ToTensor, Normalize, RandomCrop, RandomFlip, ConvertMaskID
 
 
 def get_voc(C, split="train"):
     if split == "train":
-        Compose([
+        transforms = Compose([
             ToTensor(),
-            RandomCrop((256, 512)),
+            RandomCrop((256, 256)),
+            Resize((256, 256)),
             Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-            ])
+        ])
     else:
         transforms = Compose([
             ToTensor(),
-            Resize((256, 512)),
+            Resize((256, 256)),
             Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
         ])
 
-    return VOCSegmentation(**C, image_set=split, transforms=transforms)
+    return VOCSegmentation(C['root'], download=True, image_set=split, transforms=transforms)
 
 
 def get_cityscapes(C, split="train"):
@@ -28,13 +29,15 @@ def get_cityscapes(C, split="train"):
         transforms = Compose([
             ToTensor(),
             RandomCrop(768),
+            ConvertMaskID(Cityscapes.classes),
             RandomFlip(),
             Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
         ])
     else:
         transforms = Compose([
             ToTensor(),
             Resize(768),
+            ConvertMaskID(Cityscapes.classes),
             Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
         ])
     return Cityscapes(**C, split=split, transforms=transforms)
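For orientation, a minimal usage sketch of these factories (hypothetical values; the actual keys of C come from the data: section of configs/default.yaml, and the root key is assumed here):

# Hypothetical config dict; keys mirror the data: section of default.yaml.
C = {"root": "cityscapes", "mode": "fine", "target_type": "semantic"}
train_set = get_cityscapes(C, split="train")  # expands to Cityscapes(**C, split="train", ...)
val_set = get_cityscapes(C, split="val")      # val masks also pass through ConvertMaskID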

datas/transforms.py

Lines changed: 24 additions & 2 deletions
@@ -15,16 +15,21 @@ def __init__(self, shape):
     def __call__(self, img, mask):
         img, mask = img.unsqueeze(0), mask.unsqueeze(0).float()
         img = F.interpolate(img, size=self.shape, mode="bilinear", align_corners=False)
-        mask = F.interpolate(mask, size=self.shape, mode="bilinear", align_corners=False)
+        mask = F.interpolate(mask, size=self.shape, mode="nearest")
         return img[0], mask[0].byte()
 
 
 class RandomCrop:
     def __init__(self, shape):
         self.shape = [shape, shape] if isinstance(shape, int) else shape
+        self.fill = 0
+        self.padding_mode = 'constant'
 
     def _get_range(self, shape, crop_shape):
-        start = random.randint(0, shape - crop_shape)
+        if shape == crop_shape:
+            start = 0
+        else:
+            start = random.randint(0, shape - crop_shape)
         end = start + crop_shape
         return start, end
 
@@ -79,3 +84,20 @@ def __call__(self, img, mask):
         for t in self.transforms:
             img, mask = t(img, mask)
         return img, mask
+
+
+class ConvertMaskID:
+    """
+    Convert 34 classes to 19 classes
+
+    Change the `id` value of CityscapesClass to `train_id`
+    """
+    def __init__(self, classes):
+        self.classes = classes
+
+    def __call__(self, img, mask):
+        mask_train_id = mask.clone()
+        for c in self.classes:
+            mask_train_id[mask == c.id] = c.train_id
+
+        return img, mask_train_id
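This mapping is what the commit message's "Add ignore index on Cross Entropy" relies on: in torchvision's Cityscapes.classes table the void classes carry train_id 255, so after ConvertMaskID those pixels can be excluded from the loss. A minimal sketch, assuming 255 is the ignore value used here:

import torch.nn as nn

# Void pixels end up as train_id 255 after ConvertMaskID (255 assumed from
# torchvision's Cityscapes class table); the loss skips them entirely.
criterion = nn.CrossEntropyLoss(ignore_index=255)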

infer.py

Lines changed: 5 additions & 2 deletions
@@ -1,13 +1,16 @@
-import torch
+import time
 import logging
 
+import torch
+
 from utils.metrics import ConfusionMatrix
 
 
 @torch.no_grad()
 def infer(loader, net, device):
     net.eval()
-    metric = ConfusionMatrix(len(loader.dataset.classes) - 1)
+    num_classes = 19  # Hard coding for Cityscapes
+    metric = ConfusionMatrix(num_classes)
     for i, (x, gt) in enumerate(loader):
         x = x.to(device, non_blocking=True)
         gt = gt.squeeze(1).to(device, dtype=torch.long, non_blocking=True)
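utils.metrics.ConfusionMatrix itself is not part of this diff. A common way to get the "faster metric calculation" behaviour is to accumulate the confusion matrix on-device with torch.bincount; the following is a sketch under that assumption, not the repo's actual implementation:

import torch

class ConfusionMatrix:
    # Sketch of a bincount-based confusion matrix for mIoU; interface assumed.
    def __init__(self, num_classes):
        self.num_classes = num_classes
        self.mat = None

    def update(self, gt, pred):
        # gt, pred: flattened long tensors of labels and predictions
        n = self.num_classes
        if self.mat is None:
            self.mat = torch.zeros(n, n, dtype=torch.int64, device=gt.device)
        valid = (gt >= 0) & (gt < n)        # drops ignore labels such as 255
        idx = n * gt[valid] + pred[valid]   # encode each (gt, pred) pair as one index
        self.mat += torch.bincount(idx, minlength=n * n).reshape(n, n)

    def miou(self):
        h = self.mat.float()
        iou = torch.diag(h) / (h.sum(0) + h.sum(1) - torch.diag(h))
        return iou.mean().item()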

main.py

Lines changed: 5 additions & 1 deletion
@@ -75,7 +75,11 @@ def set_loggging(save_dir):
         PointHead(**C.net.pointhead)
     ).to(device)
 
-    optim = torch.optim.AdamW(net.parameters())
+    params = [{"params": net.backbone.backbone.parameters(), "lr": C.train.lr},
+              {"params": net.head.parameters(), "lr": C.train.lr},
+              {"params": net.backbone.classifier.parameters(), "lr": C.train.lr * 10}]
+
+    optim = torch.optim.SGD(params, momentum=C.train.momentum, weight_decay=C.train.weight_decay)
 
     net, optim = amp.initialize(net, optim, opt_level=C.apex.opt)
     if args.distributed:
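Here net.backbone.classifier is the DeepLabHead, which wraps the ASPP module, so this parameter group carries the 10x learning rate called out in the commit message; the ResNet backbone and the PointHead stay at the base C.train.lr (0.01 per the config above).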

model/deeplab.py

Lines changed: 5 additions & 5 deletions
@@ -6,6 +6,7 @@
 from torchvision.models.segmentation.deeplabv3 import DeepLabHead
 from torchvision.models.segmentation.fcn import FCNHead
 from .resnet import resnet103, resnet53
+from torchvision.models import resnet50, resnet101
 
 
 class SmallDeepLab(_SimpleSegmentationModel):
@@ -17,8 +18,10 @@ def forward(self, input_):
 
 def deeplabv3(pretrained=False, resnet="res103", head_in_ch=2048, num_classes=21):
     resnet = {
-        "res53": resnet53,
-        "res103": resnet103
+        "res53": resnet53,
+        "res103": resnet103,
+        "res50": resnet50,
+        "res101": resnet101
     }[resnet]
 
     net = SmallDeepLab(
@@ -28,9 +31,6 @@ def deeplabv3(pretrained=False, resnet="res103", head_in_ch=2048, num_classes=21
         ),
         classifier=DeepLabHead(head_in_ch, num_classes)
     )
-    if pretrained:
-        state_dict = load_state_dict_from_url('https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth', progress=True)
-        net.load_state_dict(state_dict)
     return net
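The removed block hard-coded the DeepLabV3-ResNet101 COCO checkpoint regardless of which backbone was selected; after this change the pretrained flag no longer triggers that download.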

model/pointrend.py

Lines changed: 8 additions & 5 deletions
@@ -37,8 +37,11 @@ def forward(self, x, res2, out):
 
     @torch.no_grad()
     def inference(self, x, res2, out):
-        stride = x.shape[-1] // out.shape[-1]
-        num_points = x.shape[-1] // stride
+        """
+        During inference, subdivision uses N=8096
+        (i.e., the number of points in the stride 16 map of a 1024×2048 image)
+        """
+        num_points = 8096
 
         while out.shape[-1] != x.shape[-1]:
             out = F.interpolate(out, scale_factor=2, mode="bilinear", align_corners=True)
@@ -54,9 +57,9 @@ def inference(self, x, res2, out):
 
         B, C, H, W = out.shape
         points_idx = points_idx.unsqueeze(1).expand(-1, C, -1)
-        out = out.reshape(B, C, -1)
-        out = out.scatter_(2, points_idx, rend)
-        out = out.view(B, C, H, W)
+        out = (out.reshape(B, C, -1)
+                  .scatter_(2, points_idx, rend)
+                  .view(B, C, H, W))
 
         return {"fine": out}
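A small aside on the hard-coded N=8096: the docstring echoes the PointRend paper's wording, but a 1024×2048 image at stride 16 has 64 × 128 = 8192 points, so 8096 appears to be a figure carried over from the paper rather than the exact stride-16 count.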

tests/test_cityscapes.ipynb

Lines changed: 142 additions & 0 deletions
Large diffs are not rendered by default.
