BottleneckLSTM

BottleneckLSTM comes from Google's CVPR 2018 paper Mobile Video Object Detection with Temporally-Aware Feature Maps.
Compared with ConvLSTM, its biggest improvement is the Bottleneck Gate, which fuses h_{t-1} with the input before the gate convolutions are applied, reducing the number of convolutions and the model's parameter count, and thereby speeding up the cell.
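For concreteness, here is a sketch of the cell updates along the lines of the paper, where φ is the activation (ReLU in the paper), ∗ denotes convolution, and ∘ is the elementwise product; the peephole terms Wci/Wcf/Wco that appear in the code below are omitted here:

$$
\begin{aligned}
b_t &= \phi\big(W_b \ast [x_t,\, h_{t-1}]\big) && \text{bottleneck gate}\\
i_t &= \sigma(W_i \ast b_t), \quad f_t = \sigma(W_f \ast b_t), \quad o_t = \sigma(W_o \ast b_t)\\
c_t &= f_t \circ c_{t-1} + i_t \circ \phi(W_c \ast b_t)\\
h_t &= o_t \circ \phi(c_t)
\end{aligned}
$$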
[Figure: Bottleneck block]

Following the 1×1 → 3×3 path in the figure above, let's do a quick calculation. Suppose the input feature map has 256 channels and the output is required to have 256 channels as well. There are two options:
1. Feed the 256-channel input directly through a 3×3 convolution with 256 output channels. The parameter count is 256×3×3×256 = 589,824.
2. First pass the 256-channel input through a 1×1 convolution with 64 output channels, then a 3×3 convolution with 64 output channels, and finally a 1×1 convolution with 256 output channels. The parameter count is 256×1×1×64 + 64×3×3×64 + 64×1×1×256 = 69,632, cutting the first option's parameter count by roughly a factor of nine! (A quick check follows below.)
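To double-check these counts, a minimal PyTorch sketch (bias-free convolutions, since the numbers above ignore bias terms):

import torch.nn as nn

def n_params(module):
    return sum(p.numel() for p in module.parameters())

direct = nn.Conv2d(256, 256, kernel_size=3, bias=False)
bottleneck = nn.Sequential(
    nn.Conv2d(256, 64, kernel_size=1, bias=False),  # 1x1 reduce:  16,384
    nn.Conv2d(64, 64, kernel_size=3, bias=False),   # 3x3:         36,864
    nn.Conv2d(64, 256, kernel_size=1, bias=False),  # 1x1 expand:  16,384
)
print(n_params(direct))      # 589824
print(n_params(bottleneck))  # 69632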
The 1×1 convolution has proved to be a far-reaching idea: later large networks routinely apply 1×1 convolutions to cut parameter counts.
Implementation source code:
import math
import logging

import torch
import torch.nn as nn
from torch.autograd import Variable


class BottleneckLSTMCell(nn.Module):
    """Creates an LSTM cell.

    Arguments:
        input_channels : number of channels in the input
        hidden_channels : number of channels in the hidden state of the LSTM cell
    """
    def __init__(self, input_channels, hidden_channels):
        super(BottleneckLSTMCell, self).__init__()
        assert hidden_channels % 2 == 0
        self.input_channels = int(input_channels)
        self.hidden_channels = int(hidden_channels)
        self.num_features = 4
        # Depthwise 3x3 convolution applied to the raw input
        self.W = nn.Conv2d(in_channels=self.input_channels, out_channels=self.input_channels,
                           kernel_size=3, groups=self.input_channels, stride=1, padding=1)
        # Bottleneck gate: a 1x1 convolution that fuses the input with the
        # hidden state and reduces the result back to hidden_channels
        self.Wy = nn.Conv2d(int(self.input_channels + self.hidden_channels),
                            self.hidden_channels, kernel_size=1)
        self.Wi = nn.Conv2d(self.hidden_channels, self.hidden_channels, 3, 1, 1,
                            groups=self.hidden_channels, bias=False)
        # 1x1 convolutions for the input, forget, cell and output gates
        self.Wbi = nn.Conv2d(self.hidden_channels, self.hidden_channels, 1, 1, 0, bias=False)
        self.Wbf = nn.Conv2d(self.hidden_channels, self.hidden_channels, 1, 1, 0, bias=False)
        self.Wbc = nn.Conv2d(self.hidden_channels, self.hidden_channels, 1, 1, 0, bias=False)
        self.Wbo = nn.Conv2d(self.hidden_channels, self.hidden_channels, 1, 1, 0, bias=False)
        # Peephole weights, allocated lazily in init_hidden
        self.Wci = None
        self.Wcf = None
        self.Wco = None
        logging.info("Initializing weights of lstm")
        self._initialize_weights()

    def _initialize_weights(self):
        """Initializes the weights of the model (He initialization for convolutions)."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, x, h, c):
        """
        Implemented as in the paper; the only difference is that here Wbi,
        Wbf, Wbc and Wbo are separate convolutions, while the paper computes
        them all together.

        Arguments:
            x : input tensor
            h : hidden state tensor
            c : cell state tensor
        Returns:
            new hidden state and new cell state
        """
        x = self.W(x)
        y = torch.cat((x, h), 1)  # concatenate input and hidden state -- the key improvement
        i = self.Wy(y)            # bottleneck: reduce to hidden_channels
        b = self.Wi(i)            # depthwise 3x3
        ci = torch.sigmoid(self.Wbi(b) + c * self.Wci)
        cf = torch.sigmoid(self.Wbf(b) + c * self.Wcf)
        cc = cf * c + ci * torch.relu(self.Wbc(b))
        co = torch.sigmoid(self.Wbo(b) + cc * self.Wco)
        ch = co * torch.relu(cc)
        return ch, cc

    def init_hidden(self, batch_size, hidden, shape):
        """
        Arguments:
            batch_size : batch size used during training
            hidden : number of channels in the hidden state
            shape : spatial shape (height, width) of the hidden and cell state
        Returns:
            zero-initialized hidden state and cell state
        """
        if self.Wci is None:
            self.Wci = Variable(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
            self.Wcf = Variable(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
            self.Wco = Variable(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
        else:
            assert shape[0] == self.Wci.size()[2], 'Input Height Mismatched!'
            assert shape[1] == self.Wci.size()[3], 'Input Width Mismatched!'
        return (Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda(),
                Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda())
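A minimal usage sketch (all shapes hypothetical): unroll the cell over a short frame sequence. Since init_hidden hard-codes .cuda(), this assumes a CUDA device is available.

cell = BottleneckLSTMCell(input_channels=256, hidden_channels=256).cuda()
h, c = cell.init_hidden(batch_size=1, hidden=256, shape=(10, 10))
frames = torch.randn(4, 1, 256, 10, 10).cuda()  # (time, batch, C, H, W)
for t in range(frames.size(0)):
    h, c = cell(frames[t], h, c)
print(h.shape)  # torch.Size([1, 256, 10, 10])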