update backend for PyTorch Update (#130)

zhanghang1989 · web-flow · commit c2cb2aab69d5 · 2018-10-04T12:00:25.000-07:00
* update backend
* version

fixes #123
diff --git a/LICENSE b/LICENSE
@@ -1,7 +1,7 @@
 MIT License
 
-Copyright (c) 2017 Hang Zhang. All rights reserved.
-Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved.
+Copyright (c) 2017-     Hang Zhang. All rights reserved.
+Copyright (c) 2018-     Amazon.com, Inc. or its affiliates. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/docs/source/experiments/segmentation.rst b/docs/source/experiments/segmentation.rst
@@ -83,15 +83,15 @@ Test Pre-trained Model
 
 
     <code xml:space="preserve" id="cmd_enc101_ade" style="display: none; text-align: left; white-space: pre-wrap">
-    CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset ADE20K --model EncNet --aux --se-loss --backbone resnet101 
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset ADE20K --model EncNet --aux --se-loss --backbone resnet101 --base-size 640 --crop-size 576
     </code>
 
     <code xml:space="preserve" id="cmd_enc101_voc" style="display: none; text-align: left; white-space: pre-wrap">
     # First finetuning COCO dataset pretrained model on augmented set
     # You can also train from scratch on COCO by yourself
-    CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset Pascal_aug --model-zoo EncNet_Resnet101_COCO --aux --se-loss --lr 0.001 --syncbn --ngpus 4 --checkname res101
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset Pascal_aug --model-zoo EncNet_Resnet101_COCO --aux --se-loss --lr 0.001 --syncbn --ngpus 4 --checkname res101 --ft
     # Finetuning on original set
-    CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset Pascal_voc --model encnet --aux  --se-loss --backbone resnet101 --lr 0.0001 --syncbn --ngpus 4 --checkname res101 --resume runs/Pascal_aug/encnet/res101/checkpoint.params
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset Pascal_voc --model encnet --aux  --se-loss --backbone resnet101 --lr 0.0001 --syncbn --ngpus 4 --checkname res101 --resume runs/Pascal_aug/encnet/res101/checkpoint.params --ft
     </code>
 
 Quick Demo
diff --git a/docs/source/experiments/texture.rst b/docs/source/experiments/texture.rst
@@ -22,7 +22,7 @@ Test Pre-trained Model
     cd PyTorch-Encoding/
     python scripts/prepare_minc.py
 
-- Download pre-trained model (pre-trained on train-1 split using single training size of 224, with an error rate of :math:`19.70\%` using single crop on test-1 set)::
+- Download pre-trained model (pre-trained on train-1 split using single training size of 224, with an error rate of :math:`18.96\%` using single crop on test-1 set)::
 
     cd experiments/recognition
     python model/download_models.py
diff --git a/encoding/__init__.py b/encoding/__init__.py
@@ -10,4 +10,4 @@
 
 """An optimized PyTorch package with CUDA backend."""
 from .version import __version__
-from . import nn, functions, dilated, parallel, utils, models, datasets, optimizer
+from . import nn, functions, dilated, parallel, utils, models, datasets
diff --git a/encoding/datasets/cityscapes.py b/encoding/datasets/cityscapes.py
@@ -6,6 +6,7 @@
 
 import os
 import sys
+import random
 import numpy as np
 from tqdm import tqdm, trange
 from PIL import Image, ImageOps, ImageFilter
@@ -93,7 +94,7 @@ def _sync_transform(self, img, mask):
             mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
         crop_size = self.crop_size
         # random scale (short edge from 480 to 720)
-        short_size = random.randint(int(self.base_size*0.5), int(self.base_size*2.5))
+        short_size = random.randint(int(self.base_size*0.5), int(self.base_size*2.0))
         w, h = img.size
         if h > w:
             ow = short_size
diff --git a/encoding/lib/cpu/nms_cpu.cpp b/encoding/lib/cpu/nms_cpu.cpp
@@ -1,3 +1,4 @@
+#include <torch/tensor.h>
 #include <ATen/ATen.h>
 #include <ATen/NativeFunctions.h>
 
@@ -42,7 +43,8 @@ std::vector<at::Tensor> Non_Max_Suppression_CPU(
   
   auto num_boxes = input.size(1);
   auto batch_size = input.size(0);
-  auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
+  auto mask = torch::zeros({batch_size, num_boxes}, input.type().toScalarType(at::kByte));
+  // Legacy allocation kept for reference: input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
   mask.fill_(1);
   auto *rawMask = mask.data<unsigned char>();
   auto *rawIdx = sorted_inds.data<int64_t>();
diff --git a/encoding/lib/cpu/roi_align_cpu.cpp b/encoding/lib/cpu/roi_align_cpu.cpp
@@ -1,3 +1,4 @@
+#include <torch/tensor.h>
 #include <ATen/ATen.h>
 //#include <omp.h>
 
@@ -404,7 +405,7 @@ at::Tensor ROIAlign_Forward_CPU(
   AT_ASSERT(roi_cols == 4 || roi_cols == 5);
 
   // Output at::Tensor is (num_rois, C, pooled_height, pooled_width)
-  auto output = input.type().tensor({num_rois, channels, pooled_height, pooled_width});
+  auto output = torch::zeros({num_rois, channels, pooled_height, pooled_width}, input.options());
 
   AT_ASSERT(input.is_contiguous());
   AT_ASSERT(bottom_rois.is_contiguous());
@@ -451,7 +452,7 @@ at::Tensor ROIAlign_Backward_CPU(
   AT_ASSERT(roi_cols == 4 || roi_cols == 5);
 
   // Output at::Tensor is (num_rois, C, pooled_height, pooled_width)
-  auto grad_in = bottom_rois.type().tensor({b_size, channels, height, width}).zero_(); 
+  auto grad_in = torch::zeros({b_size, channels, height, width}, bottom_rois.options());
 
   AT_ASSERT(bottom_rois.is_contiguous());
 
diff --git a/encoding/lib/cpu/syncbn_cpu.cpp b/encoding/lib/cpu/syncbn_cpu.cpp
@@ -1,3 +1,4 @@
+#include <torch/tensor.h>
 #include <ATen/ATen.h>
 #include <vector>
 
@@ -45,8 +46,8 @@ std::vector<at::Tensor> BatchNorm_Backward_CPU(
 std::vector<at::Tensor> Sum_Square_Forward_CPU(
     const at::Tensor input) {
   /* outputs */
-  at::Tensor sum = input.type().tensor({input.size(1)}).zero_();
-  at::Tensor square = input.type().tensor({input.size(1)}).zero_();
+  at::Tensor sum = torch::zeros({input.size(1)}, input.options());
+  at::Tensor square = torch::zeros({input.size(1)}, input.options());
   return {sum, square};
 }
 
diff --git a/encoding/lib/gpu/encoding_kernel.cu b/encoding/lib/gpu/encoding_kernel.cu
@@ -1,4 +1,5 @@
 #include <vector>
+#include <torch/tensor.h>
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 
@@ -165,7 +166,7 @@ at::Tensor Aggregate_Forward_CUDA(
     const at::Tensor X_,
     const at::Tensor C_) {
   /* Device tensors */
-  auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_(); 
+  auto E_ = torch::zeros({A_.size(0), C_.size(0), C_.size(1)}, A_.options());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // B, K, D
   dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
@@ -214,7 +215,7 @@ at::Tensor ScaledL2_Forward_CUDA(
     const at::Tensor X_,
     const at::Tensor C_,
     const at::Tensor S_) {
-  auto SL_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
+  auto SL_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
   dim3 threads(getNumThreads(C_.size(1)));
diff --git a/encoding/lib/gpu/encodingv2_kernel.cu b/encoding/lib/gpu/encodingv2_kernel.cu
@@ -1,4 +1,5 @@
 #include <vector>
+#include <torch/tensor.h>
 #include <ATen/ATen.h>
 #include <ATen/Functions.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -239,7 +240,7 @@ at::Tensor Encoding_Dist_Inference_Forward_CUDA(
     const at::Tensor STD_) {
     // const at::Tensor S_,
   // X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
-  auto KD_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
+  auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
   // E(x), E(x^2)
   int N = X_.size(0) * X_.size(1);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -301,7 +302,7 @@ std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
     double eps) {
     // const at::Tensor S_,
   // X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
-  auto KD_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
+  auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
   // E(x), E(x^2)
   int N = X_.size(0) * X_.size(1);
   auto SVar_ = (X_.pow(2).sum(0).sum(0).view({1, X_.size(2)}) -
@@ -373,7 +374,7 @@ at::Tensor AggregateV2_Forward_CUDA(
     const at::Tensor C_,
     const at::Tensor STD_) {
   /* Device tensors */
-  auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_(); 
+  auto E_ = torch::zeros({A_.size(0), C_.size(0), C_.size(1)}, A_.options());
   // auto IS_ = 1.0f / (S_ + eps).sqrt();
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // B, K, D
diff --git a/encoding/lib/gpu/nms_kernel.cu b/encoding/lib/gpu/nms_kernel.cu
@@ -1,3 +1,4 @@
+#include <torch/tensor.h>
 #include <ATen/ATen.h>
 #include "ATen/NativeFunctions.h"
 #include <ATen/cuda/CUDAContext.h>
@@ -75,7 +76,8 @@ std::vector<at::Tensor> Non_Max_Suppression_CUDA(
 
   auto num_boxes = input.size(1);
   auto batch_size = input.size(0);
-  auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
+  // Legacy allocation kept for reference: input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
+  auto mask = torch::zeros({batch_size, num_boxes}, input.type().toScalarType(at::kByte));
   mask.fill_(1);
   
   //need the indices of the boxes sorted by score.
diff --git a/encoding/lib/gpu/roi_align_kernel.cu b/encoding/lib/gpu/roi_align_kernel.cu
@@ -1,3 +1,4 @@
+#include <torch/tensor.h>
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 
@@ -367,7 +368,7 @@ at::Tensor ROIAlign_Forward_CUDA(
   auto width = input.size(3);
 
   // Output Tensor is (num_rois, C, pooled_height, pooled_width)
-  auto output = input.type().tensor({proposals, channels, pooled_height, pooled_width});
+  auto output = torch::zeros({proposals, channels, pooled_height, pooled_width}, input.options());
 
   auto count = output.numel();
   
@@ -414,7 +415,7 @@ at::Tensor ROIAlign_Backward_CUDA(
 
   // Output Tensor is (num_rois, C, pooled_height, pooled_width)
   // gradient wrt input features
-  auto grad_in = rois.type().tensor({b_size, channels, height, width}).zero_(); 
+  auto grad_in = torch::zeros({b_size, channels, height, width}, rois.options());
   auto num_rois = rois.size(0);
   auto count = grad_output.numel();
 
diff --git a/encoding/lib/gpu/syncbn_kernel.cu b/encoding/lib/gpu/syncbn_kernel.cu
@@ -1,4 +1,5 @@
 #include <vector>
+#include <torch/tensor.h>
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 
@@ -244,8 +245,8 @@ std::vector<at::Tensor> BatchNorm_Backward_CUDA(
 std::vector<at::Tensor> Sum_Square_Forward_CUDA(
     const at::Tensor input_) {
   /* outputs */
-  at::Tensor sum_ = input_.type().tensor({input_.size(1)}).zero_();
-  at::Tensor square_ = input_.type().tensor({input_.size(1)}).zero_();
+  at::Tensor sum_ = torch::zeros({input_.size(1)}, input_.options());
+  at::Tensor square_ = torch::zeros({input_.size(1)}, input_.options());
   /* cuda utils*/
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   dim3 blocks(input_.size(1));
diff --git a/encoding/models/model_store.py b/encoding/models/model_store.py
@@ -11,7 +11,7 @@
     ('2a57e44de9c853fa015b172309a1ee7e2d0e4e2a', 'resnet101'),
     ('0d43d698c66aceaa2bc0309f55efdd7ff4b143af', 'resnet152'),
     ('2e22611a7f3992ebdee6726af169991bc26d7363', 'deepten_minc'),
-    ('fc8c0b795abf0133700c2d4265d2f9edab7eb6cc', 'fcn_resnet50_ade'),
+    ('662e979de25a389f11c65e9f1df7e06c2c356381', 'fcn_resnet50_ade'),
     ('eeed8e582f0fdccdba8579e7490570adc6d85c7c', 'fcn_resnet50_pcontext'),
     ('54f70c772505064e30efd1ddd3a14e1759faa363', 'psp_resnet50_ade'),
     ('075195c5237b778c718fd73ceddfa1376c18dfd0', 'deeplab_resnet50_ade'),
diff --git a/experiments/segmentation/option.py b/experiments/segmentation/option.py
@@ -92,15 +92,15 @@ def parse(self):
         if args.epochs is None:
             epoches = {
                 'coco': 30,
-                'citys': 180,
+                'citys': 240,
                 'pascal_voc': 50,
                 'pascal_aug': 50,
                 'pcontext': 80,
                 'ade20k': 120,
             }
             args.epochs = epoches[args.dataset.lower()]
         if args.batch_size is None:
-            args.batch_size = 4 * torch.cuda.device_count()
+            args.batch_size = 16
         if args.test_batch_size is None:
             args.test_batch_size = args.batch_size
         if args.lr is None:
diff --git a/setup.py b/setup.py
@@ -18,7 +18,7 @@
 
 cwd = os.path.dirname(os.path.abspath(__file__))
 
-version = '0.5.0'
+version = '0.5.1'
 try:
     sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], 
         cwd=cwd).decode('ascii').strip()