// Tensor argument validation helpers built on TORCH_CHECK.
// Each macro stringifies its argument (#x) so the error message names the
// offending variable.

// Fails unless `x` reports a CUDA device.
#define CHECK_CUDA(x) \
  TORCH_CHECK((x).device().is_cuda(), #x " must be a CUDA tensor")

// Fails if `x` reports a CUDA device.
#define CHECK_CPU(x) \
  TORCH_CHECK(!(x).device().is_cuda(), #x " must be a CPU tensor")

// Fails unless `x.is_contiguous()` is true.
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")

// Combined device + contiguity checks. They are wrapped in do/while(0) so that
// both checks stay together when the macro is used as the single statement of
// an unbraced `if`/`else`; call sites still end the macro with a semicolon.
#define CHECK_CUDA_INPUT(x) \
  do {                      \
    CHECK_CUDA(x);          \
    CHECK_CONTIGUOUS(x);    \
  } while (0)

#define CHECK_CPU_INPUT(x) \
  do {                     \
    CHECK_CPU(x);          \
    CHECK_CONTIGUOUS(x);   \
  } while (0)
// Loop helpers for device code. Each macro expands to `for` header(s); the
// statement or braced block that follows the macro becomes the loop body.

// 1D grid-stride loop: `i` starts at the global thread index and advances by
// the total number of launched threads (blockDim.x * gridDim.x) until it
// reaches `n`.
// NOTE(review): `i` is a plain `int`, so this is only safe while `n` plus one
// grid stride still fits in `int` - confirm against the callers.
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// 2D grid-stride loop: `i` strides over [0, n) along x and `j` over [0, m)
// along y. The built-in index variables are 32-bit, so one operand is widened
// to size_t before multiplying to keep the product from wrapping.
#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                                       \
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
       i < (n); i += static_cast<size_t>(blockDim.x) * gridDim.x)             \
    for (size_t j =                                                           \
             static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;      \
         j < (m); j += static_cast<size_t>(blockDim.y) * gridDim.y)

// 2D block-level loop: `i` and `j` start at the block indices and advance by
// the grid dimensions, so every thread of a block sees the same (i, j) pairs.
#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
#define THREADS_PER_BLOCK 512

/**
 * Number of blocks to launch so that `num_threads` threads per block cover
 * `N` work items.
 *
 * The result is ceil(N / num_threads), capped at 4096 and never smaller
 * than 1.
 *
 * @param N            Number of work items.
 * @param num_threads  Threads per block (defaults to THREADS_PER_BLOCK);
 *                     must be positive.
 * @return             Block count in the range [1, 4096].
 */
inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
  const int max_block_num = 4096;

  // A launch with zero blocks is rejected as an invalid configuration, so an
  // empty (or negative) workload still gets a single block.
  if (N <= 0) {
    return 1;
  }

  // Ceil-division written without `N + num_threads - 1`, which overflows
  // `int` when N is close to INT_MAX.
  const int optimal_block_num =
      N / num_threads + ((N % num_threads != 0) ? 1 : 0);

  return (optimal_block_num < max_block_num) ? optimal_block_num
                                             : max_block_num;
}
template <typename T> __device__ T bilinear_interpolate(const T* input, constint height, constint width, T y, T x, constint index /* index for debug only*/){ // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) return0;
if (y <= 0) y = 0; if (x <= 0) x = 0;
int y_low = (int)y; int x_low = (int)x; int y_high; int x_high;
voidmy_conv_shape_check(at::Tensor input, at::Tensor weight, int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, int group) { TORCH_CHECK( weight.ndimension() == 4, "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: %s", weight.ndimension());
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(kW > 0 && kH > 0, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW), "kernel size should be consistent with weight, ", "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, kW, weight.size(2), weight.size(3));
TORCH_CHECK(dW > 0 && dH > 0, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
TORCH_CHECK( dilationW > 0 && dilationH > 0, "dilation should be greater than 0, but got dilationH: %d dilationW: %d", dilationH, dilationW);
int ndim = input.ndimension(); int dimf = 0; int dimh = 1; int dimw = 2;
if (ndim == 4) { dimf++; dimh++; dimw++; }
TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", ndim);
long nInputPlane = weight.size(1) * group; long inputHeight = input.size(dimh); long inputWidth = input.size(dimw); long nOutputPlane = weight.size(0); long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
if (outputWidth < 1 || outputHeight < 1) AT_ERROR( "Given input size: (%ld x %ld x %ld). " "Calculated output size: (%ld x %ld x %ld). Output size is too small", nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, outputWidth);
TORCH_CHECK(input.size(1) == nInputPlane, "invalid number of input planes, expected: %d, but got: %d", nInputPlane, input.size(1));
TORCH_CHECK((inputHeight >= kH && inputWidth >= kW), "input image is smaller than kernel"); }
voidmy_conv_forward(Tensor input, Tensor weight, Tensor bias, Tensor output, Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int im2col_step) { bool isCuda = false; if (input.device().is_cuda()) { CHECK_CUDA_INPUT(input); CHECK_CUDA_INPUT(weight); CHECK_CUDA_INPUT(bias); CHECK_CUDA_INPUT(output); CHECK_CUDA_INPUT(columns); isCuda = true; } else { CHECK_CPU_INPUT(input); CHECK_CPU_INPUT(weight); CHECK_CPU_INPUT(bias); CHECK_CPU_INPUT(output); CHECK_CPU_INPUT(columns); }
voidmy_conv_forward(Tensor input, Tensor weight, Tensor bias, Tensor output, Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int im2col_step)
import torch from torch.autograd import Function import torch.nn as nn from torch import Tensor from torch.nn.modules.utils import _pair from torch.nn.parameter import Parameter
// Modify from https://github.com/open-mmlab/mmcv/blob/my_conv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh // Copyright (c) OpenMMLab. All rights reserved. #include<torch/types.h>
defnms(predicts: np.ndarray, score_thresh: float = 0.6, iou_thresh: float = 0.3): """Non-Maximum Suppression Args: predicts (np.ndarray): Tensor of shape [n, 5]. The second demesion includes 1 probability and 4 numbers x, y, w, h denoting a bounding box. score_thresh (float): The boxes with probability lower than score_threash will be discarded. iou_thresh (float): The threshold determining whether two boxes are "overlapped". Returns: (np.ndarray, List[int]): The filtered predictions and the indices of remaining boxes. """
# Suppress for i, p inenumerate(predicts): ifnot vis[i] and i != max_index: if iou(p[1:5], max_p[1:5]) > iou_thresh: vis[i] = True n_remainder -= 1 vis[max_index] = True n_remainder -= 1
while n_remainder > 0: max_pro = -1 max_index = 0 # Find argmax for i, p inenumerate(predicts): ifnot vis[i]: if max_pro < p[0]: max_index = i max_pro = p[0]
之后,抑制掉与概率最大的框“重合”的框。
1 2 3 4 5 6 7 8 9 10 11
while n_remainder > 0: # Find argmax ...
max_p = predicts[max_index] # Suppress for i, p inenumerate(predicts): ifnot vis[i] and i != max_index: if iou(p[1:5], max_p[1:5]) > iou_thresh: vis[i] = True n_remainder -= 1
from mmdet.apis import inference_detector, init_detector, show_result_pyplot from mmdet.models.detectors import BaseDetector
# Choose to use a config and initialize the detector
config = 'configs/yolo/yolov3_mobilenetv2_320_300e_coco.py'
# Setup a checkpoint file to load
checkpoint = 'checkpoints/yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth'
# initialize the detector
model: BaseDetector = init_detector(config, checkpoint, device='cuda:0')

# Run the detector on a single demo image
img = 'demo/demo.jpg'
result = inference_detector(model, img)
# Input and preprocessing input predicts of size [19, 19, 5] resize predicts to [361, 5]
# Filter predicts with low probability filtered_predicts = [] for predict in predicts: # drop p_c < 0.6 if predict[0] >= 0.6: filtered_predicts.append(predict)
# NMS n_remainder = len(filtered_predicts) vis = [False] * n_remainder # False for unvisited item output_predicts = [] while n_remainder > 0: max_pro = -1 max_index = 0 # Find argmax for i, p inenumerate(filtered_predicts): ifnot vis[i]: if max_pro < p[0]: max_index = i max_pro = p[0]
# Suppress for i, p inenumerate(filtered_predicts): ifnot vis[i] and i != max_index: if get_IoU(p[1:5], max_p[1:5]) > 0.5: vis[i] = True n_remainder -= 1 vis[max_index] = True n_remainder -= 1
# Get output x = layers.Conv2D(64, 7, (2, 2), padding='same')(input) x = layers.MaxPool2D((3, 3), (2, 2))(x)
if model_name == 'ResNet18': x = identity_block_2(x, 3, use_shortcut) x = identity_block_2(x, 3, use_shortcut) x = convolution_block_2(x, 3, 128, 2, use_shortcut) x = identity_block_2(x, 3, use_shortcut) x = convolution_block_2(x, 3, 256, 2, use_shortcut) x = identity_block_2(x, 3, use_shortcut) x = convolution_block_2(x, 3, 512, 2, use_shortcut) x = identity_block_2(x, 3, use_shortcut) elif model_name == 'ResNet50':
def block_group(x, fs1, fs2, count):
    """Stack one stride-2 convolution block followed by identity blocks.

    Args:
        x: Input tensor.
        fs1: Passed as ``filters1`` to every block in the group.
        fs2: Passed as ``filters2`` to every block in the group.
        count: Total number of blocks; the first is ``convolution_block_3``,
            the remaining ``count - 1`` are ``identity_block_3``.

    Returns:
        Output tensor of the last block.
    """
    # `use_shortcut` is taken from the enclosing scope.
    x = convolution_block_3(x, 3, fs1, fs2, 2, use_shortcut)
    for _ in range(count - 1):
        x = identity_block_3(x, 3, fs1, fs2, use_shortcut)
    return x
x = block_group(x, 64, 256, 3) x = block_group(x, 128, 512, 4) x = block_group(x, 256, 1024, 6) x = block_group(x, 512, 2048, 3) else: raise NotImplementedError(f'No such model {model_name}')
x = layers.AveragePooling2D((2, 2), (2, 2))(x) x = layers.Flatten()(x) output = layers.Dense(1, 'sigmoid')(x)
# Build model model = models.Model(inputs=input, outputs=output) print(model.summary()) return model
x = layers.AveragePooling2D((2, 2), (2, 2))(x) x = layers.Flatten()(x) output = layers.Dense(1, 'sigmoid')(x)
残差块实现
1 2 3 4 5 6 7 8 9 10 11 12
def identity_block_2(x, f, use_shortcut=True):
    """Two-convolution residual block that keeps the input's channel count.

    Args:
        x: 4-D input tensor; the channel count is read from its last axis and
            batch normalization is applied on axis 3.
        f: Kernel size of both convolutions.
        use_shortcut: If True, the block input is added to the result before
            the final ReLU.

    Returns:
        Output tensor of the block.
    """
    _, _, _, channels = x.shape
    x_shortcut = x

    x = layers.Conv2D(channels, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.ReLU()(x)

    x = layers.Conv2D(channels, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)

    if use_shortcut:
        x = x + x_shortcut
    x = layers.ReLU()(x)
    return x
1 2 3 4 5 6 7 8 9 10 11 12 13 14
def convolution_block_2(x, f, filters, s: int, use_shortcut=True):
    """Two-convolution residual block that changes channel count and stride.

    Args:
        x: Input tensor; batch normalization is applied on axis 3.
        f: Kernel size of both main-path convolutions.
        filters: Number of output channels.
        s: Stride of the first main-path convolution and of the shortcut
            convolution.
        use_shortcut: If True, the block input is projected with a strided
            1x1 convolution + batch normalization and added to the result
            before the final ReLU.

    Returns:
        Output tensor of the block.
    """
    x_shortcut = x

    x = layers.Conv2D(filters, f, strides=(s, s), padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)

    if use_shortcut:
        # Project the input so it matches the main path's channels and stride.
        x_shortcut = layers.Conv2D(filters, 1, strides=(s, s),
                                   padding='valid')(x_shortcut)
        x_shortcut = layers.BatchNormalization(axis=3)(x_shortcut)
        x = x + x_shortcut
    x = layers.ReLU()(x)
    return x
1 2 3 4 5 6 7 8 9 10 11 12 13
def identity_block_3(x, f, filters1, filters2, use_shortcut=True):
    """Three-convolution (1x1 -> fxf -> 1x1) residual block with identity shortcut.

    Args:
        x: Input tensor; batch normalization is applied on axis 3. When
            ``use_shortcut`` is True its shape must match the block output so
            the two can be added.
        f: Kernel size of the middle convolution.
        filters1: Channels of the first and middle convolutions.
        filters2: Channels of the last convolution.
        use_shortcut: If True, the block input is added to the result before
            the final ReLU.

    Returns:
        Output tensor of the block.
    """
    x_shortcut = x

    x = layers.Conv2D(filters1, 1, padding='valid')(x)
    x = layers.BatchNormalization(axis=3)(x)
    # Activation after the first 1x1 stage; without it the first two
    # convolutions are separated only by batch normalization.
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters1, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters2, 1, padding='valid')(x)
    x = layers.BatchNormalization(axis=3)(x)

    if use_shortcut:
        x = x + x_shortcut
    x = layers.ReLU()(x)
    return x
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def convolution_block_3(x, f, filters1, filters2, s: int, use_shortcut=True):
    """Three-convolution (1x1 -> fxf -> 1x1) residual block with projection shortcut.

    Args:
        x: Input tensor; batch normalization is applied on axis 3.
        f: Kernel size of the middle convolution.
        filters1: Channels of the first and middle convolutions.
        filters2: Channels of the last convolution and of the shortcut
            projection.
        s: Stride of the first 1x1 convolution and of the shortcut
            convolution.
        use_shortcut: If True, the block input is projected with a strided
            1x1 convolution + batch normalization and added to the result
            before the final ReLU.

    Returns:
        Output tensor of the block.
    """
    x_shortcut = x

    x = layers.Conv2D(filters1, 1, strides=(s, s), padding='valid')(x)
    x = layers.BatchNormalization(axis=3)(x)
    # Activation after the first 1x1 stage; without it the first two
    # convolutions are separated only by batch normalization.
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters1, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters2, 1, padding='valid')(x)
    x = layers.BatchNormalization(axis=3)(x)

    if use_shortcut:
        # Project the input so it matches the main path's channels and stride.
        x_shortcut = layers.Conv2D(filters2, 1, strides=(s, s),
                                   padding='same')(x_shortcut)
        x_shortcut = layers.BatchNormalization(axis=3)(x_shortcut)
        x = x + x_shortcut
    x = layers.ReLU()(x)
    return x
defconv2d_backward(dZ: np.ndarray, cache: Dict[str, np.ndarray], stride: int, padding: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """2D Convolution Backward Implemented with NumPy Args: dZ: (np.ndarray): The derivative of the output of conv. cache (Dict[str, np.ndarray]): Record output 'Z', weight 'W', bias 'b' and input 'A_prev' of forward function. stride (int): Stride for convolution. padding (int): The count of zeros to pad on both sides. Outputs: Tuple[np.ndarray, np.ndarray, np.ndarray]: The derivative of W, b, A_prev. """
defconv2d_backward(dZ: np.ndarray, cache: Dict[str, np.ndarray], stride: int, padding: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """2D Convolution Backward Implemented with NumPy Args: dZ: (np.ndarray): The derivative of the output of conv. cache (Dict[str, np.ndarray]): Record output 'Z', weight 'W', bias 'b' and input 'A_prev' of forward function. stride (int): Stride for convolution. padding (int): The count of zeros to pad on both sides. Outputs: Tuple[np.ndarray, np.ndarray, np.ndarray]: The derivative of W, b, A_prev. """ W = cache['W'] b = cache['b'] A_prev = cache['A_prev'] dW = np.zeros(W.shape) db = np.zeros(b.shape) dA_prev = np.zeros(A_prev.shape)