
DenseNet Explained and Implemented in MXNet

DenseNet architectures (the architecture table from the paper)

DenseNet is built from dense blocks and transition layers. The difference between DenseNet and ResNet is that the main branch and the shortcut are no longer combined by simple addition; instead they are concatenated (concat) along the channel dimension. To keep the number of channels from growing too large, transition layers are inserted to control it (usually by halving the channel count).
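As a minimal illustration (the NDArray shapes below are arbitrary, not taken from the network), concatenation simply stacks channels, whereas a ResNet-style addition would require both tensors to have identical shapes:

import mxnet.ndarray as nd

x = nd.ones((1, 64, 56, 56))   # incoming feature map
f = nd.ones((1, 32, 56, 56))   # new features from one dense layer (k = 32)

# ResNet shortcut: x + f (shapes must match exactly).
# DenseNet shortcut: concatenate along the channel axis instead.
out = nd.concat(x, f, dim=1)
print(out.shape)               # (1, 96, 56, 56)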

How much does the channel count grow after one dense block? This is where the growth rate comes in, written as k in the paper.

A transition layer typically uses a 1x1 convolution to halve the number of channels and a stride-2 average pooling layer to halve the height and width, keeping the model complexity under control.

ResNet uses 4 residual blocks; DenseNet uses 4 dense blocks with 3 transition layers between them.
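To make the channel bookkeeping concrete, here is a small sketch in plain Python, using DenseNet-121's values (num_init_features = 64, growth rate k = 32, block_config = [6, 12, 24, 16], matching the code below):

num_features = 64                            # channels after the stem convolution
for i, num_layers in enumerate([6, 12, 24, 16]):
    num_features += num_layers * 32          # each dense layer appends k = 32 channels
    print('after dense block %d: %d channels' % (i + 1, num_features))
    if i != 3:                               # no transition after the last dense block
        num_features //= 2                   # the transition's 1x1 conv halves the channels
        print('after transition %d: %d channels' % (i + 1, num_features))

This prints 256 then 128, 512 then 256, 1024 then 512, and finally 1024 channels, which is exactly what the Gluon code later tracks in num_features.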

Where the names in the series come from:

121 = 1 + 2*(6+12+24+16) + 3 + 1

1: the 7x7 convolution at the head of the network

2*(6+12+24+16): the number of convolutions in the dense blocks (each dense layer contains a 1x1 and a 3x3 convolution)

3: the number of convolutions in the transition layers

1: the final fully connected layer

The other variants are derived in the same way; see the short sketch below.
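A quick sketch of that arithmetic (the helper name densenet_depth is made up for illustration), applied to the four block configurations used in the code below:

def densenet_depth(block_config):
    # 1 stem conv + 2 convs per dense layer + 1 conv per transition + 1 final FC layer
    return 1 + 2 * sum(block_config) + (len(block_config) - 1) + 1

print(densenet_depth([6, 12, 24, 16]))  # 121
print(densenet_depth([6, 12, 36, 24]))  # 161
print(densenet_depth([6, 12, 32, 32]))  # 169
print(densenet_depth([6, 12, 48, 32]))  # 201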

Below is the Gluon implementation. It combines the four variants into one file, adds some of my own notes, and plots the network structure.

For details, refer to Insightface's DenseNet.py.

# coding: gbk
# pylint: disable= arguments-differ
"""DenseNet, implemented in Gluon."""

import sys
import os
import mxnet as mx
import mxnet.ndarray as nd
import mxnet.gluon as gluon
import mxnet.gluon.nn as nn
import mxnet.autograd as ag
import symbol_utils

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from config import config


def Act():
    if config.net_act == 'prelu':
        return nn.PReLU()
    else:
        return nn.Activation(config.net_act)


# Helpers
def _make_dense_block(num_layers, bn_size, growth_rate, dropout, stage_index):
    out = nn.HybridSequential(prefix='stage%d_' % stage_index)
    with out.name_scope():
        for _ in range(num_layers):
            out.add(_make_dense_layer(growth_rate, bn_size, dropout))
    return out


def _make_dense_layer(growth_rate, bn_size, dropout):
    # The output channels of both convolutions are fully determined once
    # bn_size and growth_rate are chosen; every dense layer is identical.
    new_features = nn.HybridSequential(prefix='')
    new_features.add(nn.BatchNorm())
    # new_features.add(nn.Activation('relu'))
    new_features.add(Act())
    new_features.add(nn.Conv2D(bn_size * growth_rate, kernel_size=1, use_bias=False))  # bottleneck, e.g. 4 * 32 = 128

    new_features.add(nn.BatchNorm())
    # new_features.add(nn.Activation('relu'))
    new_features.add(Act())
    new_features.add(nn.Conv2D(growth_rate, kernel_size=3, padding=1, use_bias=False))  # k = 32
    if dropout:
        new_features.add(nn.Dropout(dropout))

    # HybridConcurrent(axis=1) concatenates its children's outputs along the
    # channel axis; the Identity branch forwards the input unchanged, so the
    # layer output is [input, new_features] with growth_rate extra channels.
    out = gluon.contrib.nn.HybridConcurrent(axis=1, prefix='')
    out.add(gluon.contrib.nn.Identity())
    out.add(new_features)

    return out


def _make_transition(num_output_features):
    out = nn.HybridSequential(prefix='')
    out.add(nn.BatchNorm())
    # out.add(nn.Activation('relu'))
    out.add(Act())
    out.add(nn.Conv2D(num_output_features, kernel_size=1, use_bias=False))  # halve the number of channels
    out.add(nn.AvgPool2D(pool_size=2, strides=2))  # halve height and width (why average pooling instead of the more common max pooling?)
    return out


# Net
class DenseNet(nn.HybridBlock):
    r"""Densenet-BC model from the
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_ paper.

    Parameters
    ----------
    num_init_features : int
        Number of filters to learn in the first convolution layer.
    growth_rate : int
        Number of filters to add each layer (`k` in the paper).
    block_config : list of int
        List of integers for numbers of layers in each pooling block.
    bn_size : int, default 4
        Multiplicative factor for number of bottle neck layers.
        (i.e. bn_size * k features in the bottleneck layer)
    dropout : float, default 0
        Rate of dropout after each dense layer.
    classes : int, default 1000
        Number of classification classes.
    """

    def __init__(self, num_init_features, growth_rate, block_config,
                 bn_size=4, dropout=0, classes=1000, **kwargs):
        super(DenseNet, self).__init__(**kwargs)
        with self.name_scope():
            self.features = nn.HybridSequential(prefix='')
            # The paper's 7x7/stride-2/pad-3 stem has become 3x3/stride-1/pad-1 here,
            # presumably because the 112x112 face input is much smaller than ImageNet's 224x224.
            self.features.add(nn.Conv2D(num_init_features, kernel_size=3,
                                        strides=1, padding=1, use_bias=False))
            self.features.add(nn.BatchNorm())
            self.features.add(nn.Activation('relu'))
            self.features.add(nn.MaxPool2D(pool_size=3, strides=2, padding=1))
            # Add dense blocks
            num_features = num_init_features
            for i, num_layers in enumerate(block_config):
                self.features.add(_make_dense_block(num_layers, bn_size, growth_rate, dropout, i + 1))
                num_features = num_features + num_layers * growth_rate  # e.g. 64 + 6 * 32 = 256
                if i != len(block_config) - 1:
                    # A transition block follows dense blocks 1-3 (i = 0, 1, 2): the 1x1 conv
                    # halves the channels and the average pooling halves height and width.
                    # The last of the 4 dense blocks has no transition layer after it.
                    self.features.add(_make_transition(num_features // 2))
                    num_features = num_features // 2
            self.features.add(nn.BatchNorm())
            self.features.add(nn.Activation('relu'))
            # self.features.add(nn.AvgPool2D(pool_size=7))  # the original paper ends the CNN trunk this way; it is changed here
            # self.features.add(nn.Flatten())

            # self.output = nn.Dense(classes)

    def hybrid_forward(self, F, x):
        x = self.features(x)
        # x = self.output(x)
        return x


# Specification: (num_init_features, growth_rate k, block_config); num_init_features = 2 * k
densenet_spec = {121: (64, 32, [6, 12, 24, 16]),
                 161: (96, 48, [6, 12, 36, 24]),
                 169: (64, 32, [6, 12, 32, 32]),
                 201: (64, 32, [6, 12, 48, 32])}

# Constructor
def get_symbol():
    # num_layers = config.num_layers
    num_layers = 121
    num_init_features, growth_rate, block_config = densenet_spec[num_layers]
    net = DenseNet(num_init_features, growth_rate, block_config, dropout=False)  # config.densenet_dropout
    data = mx.sym.Variable(name='data')  # the shape of `data` is not fixed here; it is only supplied later, e.g. when plotting or binding the symbol
    data = data - 127.5
    data = data * 0.0078125
    body = net(data)
    fc1 = symbol_utils.get_fc1(body, config.emb_size, config.net_output)
    return fc1

if __name__ == "__main__":
    fc = get_symbol()
    digraph = mx.viz.plot_network(fc, title='densenet121', shape={'data': (1, 3, 112, 112)},
                                  node_attrs={"fixedsize": "false"})
    digraph.view()

    # mx.viz.print_summary(fc, shape={'data': (1, 3, 112, 112)})

Results

In the generated PDF, DenseNet-121 has a total of 7,474,048 parameters (Total params: 7474048). The next post, on ResNet, will compare the parameter counts of the two networks; the paper notes that DenseNet uses fewer parameters than ResNet.
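To reproduce that number, one option (a sketch assuming the get_symbol() from the listing above) is the per-layer summary that is commented out at the end of the script:

import mxnet as mx

fc = get_symbol()
# Prints a per-layer table and ends with the total parameter count.
mx.viz.print_summary(fc, shape={'data': (1, 3, 112, 112)})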

Advantages of DenseNet:

1. It alleviates the vanishing-gradient problem.
2. It strengthens feature propagation.
3. It makes more effective use of features.
4. It reduces the number of parameters to some extent.

The core idea of DenseNet is to create connections between different layers, which makes full use of features and further alleviates the vanishing-gradient problem; deepening the network is no longer a problem, and it trains very well. In addition, the transition layers and a relatively small growth rate keep the network narrow and reduce the number of parameters, which effectively suppresses overfitting and cuts down the computation.

References:

https://github.com/deepinsight/insightface/blob/master/recognition/symbol/fdensenet.py

DenseNet算法详解 (a Chinese write-up, "DenseNet Algorithm Explained")