RNN运算过程如下图所示。在RNN中,对于一个样本,我们每次只输入一个单词$x^{< t >}$,得到一个输出$y^{< t >}$。除了输出$y^{< t >}$外,神经网络还会把中间激活输出$a^{< t >}$传递给下一轮计算,这个$a^{< t >}$记录了之前单词的某些信息。所有的输出按照这种方法依次计算。当然,第一轮计算时也会用到激活输出$a^{< 0 >}$,简单地令$a^{< 0 >}$为零张量即可。注意,每一轮计算用的都是同一个神经网络,也就是说所有时刻共享同一套权重。
语言模型是NLP中的一个基础任务。一个语言模型能够输出某种语言某句话的出现概率。通过比较不同句子的出现概率,我们能够开发出很多应用。比如在英语里,同音的“apple and pear”比“apple and pair”的出现概率高(更可能是一个合理的句子)。当一个语音识别软件听到这句话时,可以分别写下这两句发音相近的句子,再根据语言模型断定这句话应该写成前者。
由于语料库中包含的是自然语言,而RNN的输入是one-hot编码,所以这中间要经过一个预处理的步骤。在NLP中,这一步骤叫做符号化(tokenize)。如我们在「符号标记」一节所学的,我们可以找来一个大小为10000的词汇表,根据每个单词在词汇表中的位置,生成一个one-hot编码。除了普通的词汇外,NLP中还有一些特殊的符号,比如表示句尾的<EOS> (End Of Sentence),表示词汇表里没有的词的<UNK> (Unknown)。
这个计算过程初次接触时有些令人费解,我们慢慢来看懂它。先竖着看一轮计算是怎么完成的。对于每一轮计算,都会给定一个单词编码$x^{< i >}$,输出一个softmax后的概率分布$\hat{y}^{< i >}$,它要对齐的训练标签是训练集某一句话的某个单词$y^{< i >}$。$\hat{y}$表示接收之前所有的输入单词后,此时刻应该输出某单词的概率分布,这个输出的含义和多分类中的类似。
我们刚刚学过,在计算一句话的概率时,RNN会把句子里的每一个单词输入,输出单词出现在前几个单词之后的概率分布$\hat{y}$。反过来想,我们可以根据RNN输出的概率分布,随机采样出某一个单词的下一个单词。具体来说,我们先随机生成句子里的第一个单词,把它输入RNN。再用RNN生成概率分布,对概率分布采样出下一个单词,采样出一个单词就输入一个单词,直到采样出<EOS>。这个过程就好像是在让AI生成句子一样。
语言模型是NLP中的一个基础任务。假设我们以单词为基本元素,句子为序列,那么一个语言模型能够输出某句话的出现概率。通过比较不同句子的出现概率,我们能够开发出很多应用。比如在英语里,同音的“apple and pear”比“apple and pair”的出现概率高(更可能是一个合理的句子)。当一个语音识别软件听到这句话时,可以分别写下这两句发音相近的句子,再根据语言模型断定这句话应该写成前者。
words = re.sub(u'([^\u0020\u0061-\u007a])', '', words)
这样,一个读取词汇表文件的函数就长这样:
1 2 3 4 5 6 7 8 9
defread_imdb_vocab(dir='data/aclImdb'): fn = os.path.join(dir, 'imdb.vocab') withopen(fn, 'rb') as f: word = f.read().decode('utf-8').replace('\n', ' ') words = re.sub(u'([^\u0020\u0061-\u007a])', '', word.lower()).split(' ') filtered_words = [w for w in words iflen(w) > 0]
# Smoke test of the IMDb readers: print the size of each result and a preview.
lines = read_imdb()
print('Length of the file:', len(lines))
print('lines[0]:', lines[0])

words = read_imdb_words(n_files=100)
print('Length of the words:', len(words))
# Slice instead of indexing 0..4 so the preview also works when fewer than
# five words were read.
for word in words[:5]:
    print(word)
text
1 2 3 4 5 6 7 8 9 10
the and Length of the file: 12500 lines[0]: Bromwell High is a cartoon ... Length of the words: 23425 bromwell high is a cartoon
def__getitem__(self, index): """return the (one-hot) encoding vector of a word""" word = self.words[index] + ' ' word_length = len(word) if self.is_onehot: tensor = torch.zeros(self.max_length, EMBEDDING_LENGTH) for i inrange(self.max_length): if i < word_length: tensor[i][LETTER_MAP[word[i]]] = 1 else: tensor[i][0] = 1 else: tensor = torch.zeros(self.max_length, dtype=torch.long) for i inrange(word_length): tensor[i] = LETTER_MAP[word[i]]
def __getitem__(self, index):
    """Return the encoding tensor of the word at position ``index``.

    A single space is appended to the word before encoding.

    Args:
        index: Position of the word in ``self.words``.

    Returns:
        If ``self.is_onehot`` is true, a float tensor of shape
        ``(self.max_length, EMBEDDING_LENGTH)`` with exactly one 1 per row;
        rows past the end of the word have the 1 at column 0. Otherwise a
        ``torch.long`` tensor of shape ``(self.max_length,)`` holding
        ``LETTER_MAP`` indices, zero-filled past the end of the word.
    """
    word = self.words[index] + ' '
    # Never write past the tensor: both encodings now truncate a word that is
    # longer than max_length (previously only the one-hot branch did, and the
    # index branch raised IndexError).
    n_chars = min(len(word), self.max_length)

    if self.is_onehot:
        tensor = torch.zeros(self.max_length, EMBEDDING_LENGTH)
        for i in range(n_chars):
            tensor[i][LETTER_MAP[word[i]]] = 1
        # Padding rows must still be valid one-hot vectors; column 0 is
        # presumably the blank character -- verify against LETTER_MAP.
        for i in range(n_chars, self.max_length):
            tensor[i][0] = 1
    else:
        tensor = torch.zeros(self.max_length, dtype=torch.long)
        for i in range(n_chars):
            tensor[i] = LETTER_MAP[word[i]]
    return tensor
注意!短单词的填充部分应该全是空字符。千万不要忘记给空字符的one-hot编码赋值。
1 2 3 4 5
for i inrange(self.max_length): if i < word_length: tensor[i][LETTER_MAP[word[i]]] = 1 else: tensor[i][0] = 1
defget_dataloader_and_max_length(limit_length=None, is_onehot=True, is_vocab=True): if is_vocab: words = read_imdb_vocab() else: words = read_imdb_words(n_files=200)
max_length = 0 for word in words: max_length = max(max_length, len(word))
if limit_length isnotNoneand max_length > limit_length: words = [w for w in words iflen(w) <= limit_length] max_length = limit_length
a = torch.zeros(batch, self.hidden_units, device=word.device) x = torch.zeros(batch, EMBEDDING_LENGTH, device=word.device) for i inrange(Tx): next_a = self.tanh(self.linear_a(torch.cat((a, x), 1))) hat_y = self.linear_y(next_a) output[i] = hat_y x = word[i] a = next_a
a = torch.zeros(batch, self.hidden_units, device=word.device) x = torch.zeros(batch, EMBEDDING_LENGTH, device=word.device) for i inrange(Tx): next_a = self.tanh(self.linear_a(torch.cat((a, x), 1))) hat_y = self.linear_y(next_a) output[i] = hat_y x = word[i] a = next_a
// Argument-validation helpers built on TORCH_CHECK. Each one reports the
// stringified argument expression (#x) in its error message.

// Fails unless x lives on a CUDA device.
#define CHECK_CUDA(x) \
  TORCH_CHECK((x).device().is_cuda(), #x " must be a CUDA tensor")

// Fails if x lives on a CUDA device.
#define CHECK_CPU(x) \
  TORCH_CHECK(!(x).device().is_cuda(), #x " must be a CPU tensor")

// Fails unless x is contiguous.
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")

// Device check followed by contiguity check. Wrapped in do { } while (0) so
// the two checks behave as one statement, e.g. under an unbraced `if`.
#define CHECK_CUDA_INPUT(x) \
  do {                      \
    CHECK_CUDA(x);          \
    CHECK_CONTIGUOUS(x);    \
  } while (0)

#define CHECK_CPU_INPUT(x) \
  do {                     \
    CHECK_CPU(x);          \
    CHECK_CONTIGUOUS(x);   \
  } while (0)
// Grid-stride loop header: i starts at this thread's global x index and
// advances by the total number of threads along x until it reaches n.
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)
// Nested grid-stride loop headers: i strides over [0, n) along the x
// dimension of the launch, j strides over [0, m) along the y dimension.
#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                             \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
       i += blockDim.x * gridDim.x)                                 \
    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
         j += blockDim.y * gridDim.y)
// Nested block-level loop headers: i and j start at the block indices and
// advance by the grid size, so every thread of a block sees the same (i, j).
#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
#define THREADS_PER_BLOCK 512

// Returns the number of blocks needed to cover N elements with num_threads
// threads per block (ceiling division), capped at 4096 blocks.
inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
  const int optimal_block_num = (N + num_threads - 1) / num_threads;
  const int max_block_num = 4096;
  // Plain comparison instead of an unqualified min(), which is not declared
  // in host-only translation units.
  return optimal_block_num < max_block_num ? optimal_block_num
                                           : max_block_num;
}
template <typename T> __device__ T bilinear_interpolate(const T* input, constint height, constint width, T y, T x, constint index /* index for debug only*/){ // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) return0;
if (y <= 0) y = 0; if (x <= 0) x = 0;
int y_low = (int)y; int x_low = (int)x; int y_high; int x_high;
voidmy_conv_shape_check(at::Tensor input, at::Tensor weight, int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, int group) { TORCH_CHECK( weight.ndimension() == 4, "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: %s", weight.ndimension());
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(kW > 0 && kH > 0, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW), "kernel size should be consistent with weight, ", "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, kW, weight.size(2), weight.size(3));
TORCH_CHECK(dW > 0 && dH > 0, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
TORCH_CHECK( dilationW > 0 && dilationH > 0, "dilation should be greater than 0, but got dilationH: %d dilationW: %d", dilationH, dilationW);
int ndim = input.ndimension(); int dimf = 0; int dimh = 1; int dimw = 2;
if (ndim == 4) { dimf++; dimh++; dimw++; }
TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", ndim);
long nInputPlane = weight.size(1) * group; long inputHeight = input.size(dimh); long inputWidth = input.size(dimw); long nOutputPlane = weight.size(0); long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
if (outputWidth < 1 || outputHeight < 1) AT_ERROR( "Given input size: (%ld x %ld x %ld). " "Calculated output size: (%ld x %ld x %ld). Output size is too small", nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, outputWidth);
TORCH_CHECK(input.size(1) == nInputPlane, "invalid number of input planes, expected: %d, but got: %d", nInputPlane, input.size(1));
TORCH_CHECK((inputHeight >= kH && inputWidth >= kW), "input image is smaller than kernel"); }
voidmy_conv_forward(Tensor input, Tensor weight, Tensor bias, Tensor output, Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int im2col_step) { bool isCuda = false; if (input.device().is_cuda()) { CHECK_CUDA_INPUT(input); CHECK_CUDA_INPUT(weight); CHECK_CUDA_INPUT(bias); CHECK_CUDA_INPUT(output); CHECK_CUDA_INPUT(columns); isCuda = true; } else { CHECK_CPU_INPUT(input); CHECK_CPU_INPUT(weight); CHECK_CPU_INPUT(bias); CHECK_CPU_INPUT(output); CHECK_CPU_INPUT(columns); }
voidmy_conv_forward(Tensor input, Tensor weight, Tensor bias, Tensor output, Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int im2col_step)
import torch from torch.autograd import Function import torch.nn as nn from torch import Tensor from torch.nn.modules.utils import _pair from torch.nn.parameter import Parameter
// Modify from https://github.com/open-mmlab/mmcv/blob/my_conv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh // Copyright (c) OpenMMLab. All rights reserved. #include<torch/types.h>
defnms(predicts: np.ndarray, score_thresh: float = 0.6, iou_thresh: float = 0.3): """Non-Maximum Suppression Args: predicts (np.ndarray): Tensor of shape [n, 5]. The second demesion includes 1 probability and 4 numbers x, y, w, h denoting a bounding box. score_thresh (float): The boxes with probability lower than score_threash will be discarded. iou_thresh (float): The threshold determining whether two boxes are "overlapped". Returns: (np.ndarray, List[int]): The filtered predictions and the indices of remaining boxes. """
# Suppress for i, p inenumerate(predicts): ifnot vis[i] and i != max_index: if iou(p[1:5], max_p[1:5]) > iou_thresh: vis[i] = True n_remainder -= 1 vis[max_index] = True n_remainder -= 1
while n_remainder > 0: max_pro = -1 max_index = 0 # Find argmax for i, p inenumerate(predicts): ifnot vis[i]: if max_pro < p[0]: max_index = i max_pro = p[0]
之后,抑制掉和概率最大框“重合”的框。
1 2 3 4 5 6 7 8 9 10 11
while n_remainder > 0: # Find argmax ...
max_p = predicts[max_index] # Suppress for i, p inenumerate(predicts): ifnot vis[i] and i != max_index: if iou(p[1:5], max_p[1:5]) > iou_thresh: vis[i] = True n_remainder -= 1
from mmdet.apis import inference_detector, init_detector, show_result_pyplot from mmdet.models.detectors import BaseDetector
# Choose to use a config and initialize the detector
config = 'configs/yolo/yolov3_mobilenetv2_320_300e_coco.py'
# Setup a checkpoint file to load
checkpoint = 'checkpoints/yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth'
# initialize the detector
model: BaseDetector = init_detector(config, checkpoint, device='cuda:0')

# Run the detector on a single demo image
img = 'demo/demo.jpg'
result = inference_detector(model, img)
# Input and preprocessing input predicts of size [19, 19, 5] resize predicts to [361, 5]
# Filter predicts with low probability filtered_predicts = [] for predict in predicts: # drop p_c < 0.6 if predict[0] >= 0.6: filtered_predicts.append(predict)
# NMS n_remainder = len(filtered_predicts) vis = [False] * n_remainder # False for unvisited item output_predicts = [] while n_remainder > 0: max_pro = -1 max_index = 0 # Find argmax for i, p inenumerate(filtered_predicts): ifnot vis[i]: if max_pro < p[0]: max_index = i max_pro = p[0]
# Suppress for i, p inenumerate(filtered_predicts): ifnot vis[i] and i != max_index: if get_IoU(p[1:5], max_p[1:5]) > 0.5: vis[i] = True n_remainder -= 1 vis[max_index] = True n_remainder -= 1
# Get output x = layers.Conv2D(64, 7, (2, 2), padding='same')(input) x = layers.MaxPool2D((3, 3), (2, 2))(x)
if model_name == 'ResNet18': x = identity_block_2(x, 3, use_shortcut) x = identity_block_2(x, 3, use_shortcut) x = convolution_block_2(x, 3, 128, 2, use_shortcut) x = identity_block_2(x, 3, use_shortcut) x = convolution_block_2(x, 3, 256, 2, use_shortcut) x = identity_block_2(x, 3, use_shortcut) x = convolution_block_2(x, 3, 512, 2, use_shortcut) x = identity_block_2(x, 3, use_shortcut) elif model_name == 'ResNet50':
defblock_group(x, fs1, fs2, count): x = convolution_block_3(x, 3, fs1, fs2, 2, use_shortcut) for i inrange(count - 1): x = identity_block_3(x, 3, fs1, fs2, use_shortcut) return x
x = block_group(x, 64, 256, 3) x = block_group(x, 128, 512, 4) x = block_group(x, 256, 1024, 6) x = block_group(x, 512, 2048, 3) else: raise NotImplementedError(f'No such model {model_name}')
x = layers.AveragePooling2D((2, 2), (2, 2))(x) x = layers.Flatten()(x) output = layers.Dense(1, 'sigmoid')(x)
# Build model model = models.Model(inputs=input, outputs=output) print(model.summary()) return model
x = layers.AveragePooling2D((2, 2), (2, 2))(x) x = layers.Flatten()(x) output = layers.Dense(1, 'sigmoid')(x)
残差块实现
1 2 3 4 5 6 7 8 9 10 11 12
def identity_block_2(x, f, use_shortcut=True):
    """Residual block of two f x f convolutions that keeps the input shape.

    Both convolutions use 'same' padding and as many filters as the input has
    channels, so the output can be added to the input directly.

    Args:
        x: Input feature map; must be 4-D, with channels on axis 3 (batch
            normalization is applied on that axis).
        f: Kernel size of both convolutions.
        use_shortcut: If true, add the block input to the output of the second
            batch normalization before the final ReLU.

    Returns:
        The output tensor after the final ReLU.
    """
    _, _, _, channels = x.shape
    x_shortcut = x

    x = layers.Conv2D(channels, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.ReLU()(x)

    x = layers.Conv2D(channels, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)

    if use_shortcut:
        # Merge with an explicit layer rather than a raw `+` so the addition
        # is a regular node of the Keras graph.
        x = layers.Add()([x, x_shortcut])
    x = layers.ReLU()(x)
    return x
1 2 3 4 5 6 7 8 9 10 11 12 13 14
def convolution_block_2(x, f, filters, s: int, use_shortcut=True):
    """Residual block of two f x f convolutions that changes the output shape.

    The first convolution uses stride ``s`` and both use ``filters`` output
    channels, so the shortcut branch is projected with a strided 1x1
    convolution plus batch normalization before it is added.

    Args:
        x: Input feature map; channels are expected on axis 3 (batch
            normalization is applied on that axis).
        f: Kernel size of the two main-path convolutions.
        filters: Number of output channels.
        s: Stride of the first main-path convolution and of the shortcut
            projection.
        use_shortcut: If true, add the projected input to the main path before
            the final ReLU.

    Returns:
        The output tensor after the final ReLU.
    """
    x_shortcut = x

    x = layers.Conv2D(filters, f, strides=(s, s), padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)

    if use_shortcut:
        # Project the input so its spatial size and channel count match the
        # main path.
        x_shortcut = layers.Conv2D(
            filters, 1, strides=(s, s), padding='valid')(x_shortcut)
        x_shortcut = layers.BatchNormalization(axis=3)(x_shortcut)
        # Merge with an explicit layer rather than a raw `+` so the addition
        # is a regular node of the Keras graph.
        x = layers.Add()([x, x_shortcut])
    x = layers.ReLU()(x)
    return x
1 2 3 4 5 6 7 8 9 10 11 12 13
def identity_block_3(x, f, filters1, filters2, use_shortcut=True):
    """Bottleneck residual block (1x1 -> f x f -> 1x1) with stride 1.

    Args:
        x: Input feature map; channels are expected on axis 3 (batch
            normalization is applied on that axis).
        f: Kernel size of the middle convolution.
        filters1: Number of filters of the first and the middle convolution.
        filters2: Number of filters of the last convolution. With
            ``use_shortcut`` the output is added to ``x`` unchanged, so this
            has to equal the channel count of ``x``.
        use_shortcut: If true, add the block input to the output of the last
            batch normalization before the final ReLU.

    Returns:
        The output tensor after the final ReLU.
    """
    x_shortcut = x

    x = layers.Conv2D(filters1, 1, padding='valid')(x)
    x = layers.BatchNormalization(axis=3)(x)
    # Without an activation here the 1x1 and the f x f convolution would
    # compose into a single linear map.
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters1, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters2, 1, padding='valid')(x)
    x = layers.BatchNormalization(axis=3)(x)

    if use_shortcut:
        # Merge with an explicit layer rather than a raw `+` so the addition
        # is a regular node of the Keras graph.
        x = layers.Add()([x, x_shortcut])
    x = layers.ReLU()(x)
    return x
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def convolution_block_3(x, f, filters1, filters2, s: int, use_shortcut=True):
    """Bottleneck residual block (1x1 -> f x f -> 1x1) that changes the shape.

    The first 1x1 convolution uses stride ``s`` and the last one outputs
    ``filters2`` channels, so the shortcut branch is projected with a strided
    1x1 convolution plus batch normalization before it is added.

    Args:
        x: Input feature map; channels are expected on axis 3 (batch
            normalization is applied on that axis).
        f: Kernel size of the middle convolution.
        filters1: Number of filters of the first and the middle convolution.
        filters2: Number of filters of the last convolution and of the
            shortcut projection.
        s: Stride of the first convolution and of the shortcut projection.
        use_shortcut: If true, add the projected input to the main path before
            the final ReLU.

    Returns:
        The output tensor after the final ReLU.
    """
    x_shortcut = x

    x = layers.Conv2D(filters1, 1, strides=(s, s), padding='valid')(x)
    x = layers.BatchNormalization(axis=3)(x)
    # Without an activation here the 1x1 and the f x f convolution would
    # compose into a single linear map.
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters1, f, padding='same')(x)
    x = layers.BatchNormalization(axis=3)(x)
    x = layers.ReLU()(x)

    x = layers.Conv2D(filters2, 1, padding='valid')(x)
    x = layers.BatchNormalization(axis=3)(x)

    if use_shortcut:
        # Project the input so its spatial size and channel count match the
        # main path.
        x_shortcut = layers.Conv2D(
            filters2, 1, strides=(s, s), padding='same')(x_shortcut)
        x_shortcut = layers.BatchNormalization(axis=3)(x_shortcut)
        # Merge with an explicit layer rather than a raw `+` so the addition
        # is a regular node of the Keras graph.
        x = layers.Add()([x, x_shortcut])
    x = layers.ReLU()(x)
    return x