I implement the RPN without LSTM from papers based on faster rcnn code. The result is very well for horizon text. But when add bi-directional lstm layer after last con layer, the model is not converge. And scores are the same for all image. Can anyone met this problem?

`name: "VGG_ILSVRC_16_layers"
layer {
name: 'input-data'
type: 'Python'
top: 'data'
top: 'im_info'
top: 'gt_boxes'
python_param {
module: 'roi_data_layer.layer'
layer: 'RoIDataLayer'
param_str: "'num_classes': 2"
}
}
layer {
name: "conv1_1"
type: "Convolution"
bottom: "data"
top: "conv1_1"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
}
}
layer {
name: "relu1_1"
type: "ReLU"
bottom: "conv1_1"
top: "conv1_1"
}
layer {
name: "conv1_2"
type: "Convolution"
bottom: "conv1_1"
top: "conv1_2"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
}
}
layer {
name: "relu1_2"
type: "ReLU"
bottom: "conv1_2"
top: "conv1_2"
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1_2"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv2_1"
type: "Convolution"
bottom: "pool1"
top: "conv2_1"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
}
}
layer {
name: "relu2_1"
type: "ReLU"
bottom: "conv2_1"
top: "conv2_1"
}
layer {
name: "conv2_2"
type: "Convolution"
bottom: "conv2_1"
top: "conv2_2"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
}
}
layer {
name: "relu2_2"
type: "ReLU"
bottom: "conv2_2"
top: "conv2_2"
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2_2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv3_1"
type: "Convolution"
bottom: "pool2"
top: "conv3_1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
}
}
layer {
name: "relu3_1"
type: "ReLU"
bottom: "conv3_1"
top: "conv3_1"
}
layer {
name: "conv3_2"
type: "Convolution"
bottom: "conv3_1"
top: "conv3_2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
}
}
layer {
name: "relu3_2"
type: "ReLU"
bottom: "conv3_2"
top: "conv3_2"
}
layer {
name: "conv3_3"
type: "Convolution"
bottom: "conv3_2"
top: "conv3_3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
}
}
layer {
name: "relu3_3"
type: "ReLU"
bottom: "conv3_3"
top: "conv3_3"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3_3"
top: "pool3"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv4_1"
type: "Convolution"
bottom: "pool3"
top: "conv4_1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu4_1"
type: "ReLU"
bottom: "conv4_1"
top: "conv4_1"
}
layer {
name: "conv4_2"
type: "Convolution"
bottom: "conv4_1"
top: "conv4_2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu4_2"
type: "ReLU"
bottom: "conv4_2"
top: "conv4_2"
}
layer {
name: "conv4_3"
type: "Convolution"
bottom: "conv4_2"
top: "conv4_3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu4_3"
type: "ReLU"
bottom: "conv4_3"
top: "conv4_3"
}
layer {
name: "pool4"
type: "Pooling"
bottom: "conv4_3"
top: "pool4"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv5_1"
type: "Convolution"
bottom: "pool4"
top: "conv5_1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu5_1"
type: "ReLU"
bottom: "conv5_1"
top: "conv5_1"
}
layer {
name: "conv5_2"
type: "Convolution"
bottom: "conv5_1"
top: "conv5_2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu5_2"
type: "ReLU"
bottom: "conv5_2"
top: "conv5_2"
}
layer {
name: "conv5_3"
type: "Convolution"
bottom: "conv5_2"
top: "conv5_3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layer {
name: "relu5_3"
type: "ReLU"
bottom: "conv5_3"
top: "conv5_3"
}

#========= RPN ============
#========= RPN ============

# prepare lstm inputs

layer {
name: "im2col"
bottom: "conv5_3"
top: "im2col"
type: "Im2col"
convolution_param {
pad: 1
kernel_size: 3
}
}
layer {
name: "im2col_transpose"
top: "im2col_transpose"
bottom: "im2col"
type: "Transpose"
transpose_param {
dim: 3
dim: 2
dim: 0
dim: 1
}
}
layer {
name: "lstm_input"
type: "Reshape"
bottom: "im2col_transpose"
top: "lstm_input"
reshape_param {
shape { dim: -1 }
axis: 1
num_axes: 2
}
}

layer {
name: "lstm"
type: "Lstm"
bottom: "lstm_input"
top: "lstm"
lstm_param {
num_output: 128
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
clipping_threshold: 1
}
}

# ===================== rlstm ===================

layer {
name: "lstm-reverse1"
type: "Reverse"
bottom: "lstm_input"
top: "rlstm_input"
reverse_param {
axis: 0
}
}
layer {
name: "rlstm"
type: "Lstm"
bottom: "rlstm_input"
top: "rlstm-output"
lstm_param {
num_output: 128
}
}
layer {
name: "lstm-reverse2"
type: "Reverse"
bottom: "rlstm-output"
top: "rlstm"
reverse_param {
axis: 0
}
}

# merge lstm and rlstm

layer {
name: "merge_lstm_rlstm"
type: "Concat"
bottom: "lstm"
bottom: "rlstm"
top: "merge_lstm_rlstm"
concat_param {
axis: 2
}
}
layer {
name: "lstm_output_reshape"
type: "Reshape"
bottom: "merge_lstm_rlstm"
top: "lstm_output_reshape"
reshape_param {
shape { dim: -1 dim: 1 }
axis: 1
num_axes: 1
}
}

# transpose size of output as (N, C, H, W)

layer {
name: "lstm_output"
type: "Transpose"
bottom: "lstm_output_reshape"
top: "lstm_output"
transpose_param {
dim: 2
dim: 3
dim: 1
dim: 0
}
}
layer {
name: "fc"
bottom: "lstm_output"
top: "fc"
type: "Convolution"
convolution_param {
num_output: 512
kernel_size: 1
}
}
layer {
name: "relu_fc"
type: "ReLU"
bottom: "fc"
top: "fc"
}

layer {
name: "rpn_cls_score"
type: "Convolution"
bottom: "fc"
top: "rpn_cls_score"
param { lr_mult: 1.0 }
param { lr_mult: 2.0 }
convolution_param {
num_output: 20 # 2(bg/fg) * 9(anchors)
kernel_size: 1 pad: 0 stride: 1
}
}

layer {
name: "rpn_bbox_pred"
type: "Convolution"
bottom: "fc"
top: "rpn_bbox_pred"
param { lr_mult: 1.0 }
param { lr_mult: 2.0 }
convolution_param {
num_output: 20 # 4 * 9(anchors)
kernel_size: 1 pad: 0 stride: 1
}
}

layer {
bottom: "rpn_cls_score"
top: "rpn_cls_score_reshape"
name: "rpn_cls_score_reshape"
type: "Reshape"
reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } }
}

layer {
name: 'rpn-data'
type: 'Python'
bottom: 'rpn_cls_score'
bottom: 'gt_boxes'
bottom: 'im_info'
bottom: 'data'
top: 'rpn_labels'
top: 'rpn_bbox_targets'
top: 'rpn_bbox_inside_weights'
top: 'rpn_bbox_outside_weights'
python_param {
module: 'rpn.anchor_target_layer'
layer: 'AnchorTargetLayer'
param_str: "'feat_stride': 16"
}
}

layer {
name: "rpn_loss_cls"
type: "SoftmaxWithLoss"
bottom: "rpn_cls_score_reshape"
bottom: "rpn_labels"
propagate_down: 1
propagate_down: 0
top: "rpn_cls_loss"
loss_weight: 1
loss_param {
ignore_label: -1
normalize: true
}
}

layer {
name: "rpn_loss_bbox"
type: "SmoothL1Loss"
bottom: "rpn_bbox_pred"
bottom: "rpn_bbox_targets"
bottom: 'rpn_bbox_inside_weights'
bottom: 'rpn_bbox_outside_weights'
top: "rpn_loss_bbox"
loss_weight: 1
smooth_l1_loss_param { sigma: 3.0 }
}
`