convolutional_layer.c

#include "convolutional_layer.h"
#include "utils.h"
#include "batchnorm_layer.h"
#include "im2col.h"
#include "col2im.h"
#include "blas.h"
#include "gemm.h"
#include "box.h"
#include <stdio.h>
#include <time.h>

#ifdef AI2
#include "xnor_layer.h"
#endif

#ifdef __cplusplus
#define PUT_IN_REGISTER
#else
#define PUT_IN_REGISTER register
#endif

#ifndef AI2
#define AI2 0
void forward_xnor_layer(layer l, network_state state);
#endif

void swap_binary(convolutional_layer *l)
{
    float *swap = l->weights;
    l->weights = l->binary_weights;
    l->binary_weights = swap;

#ifdef GPU
    swap = l->weights_gpu;
    l->weights_gpu = l->binary_weights_gpu;
    l->binary_weights_gpu = swap;
#endif
}

void binarize_weights(float *weights, int n, int size, float *binary)
{
    int i, f;
    for(f = 0; f < n; ++f){
        float mean = 0;
        for(i = 0; i < size; ++i){
            mean += fabs(weights[f*size + i]);
        }
        mean = mean / size;
        for(i = 0; i < size; ++i){
            binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
        }
    }
}

void binarize_cpu(float *input, int n, float *binary)
{
    int i;
    for(i = 0; i < n; ++i){
        binary[i] = (input[i] > 0) ? 1 : -1;
    }
}

void binarize_input(float *input, int n, int size, float *binary)
{
    int i, s;
    for(s = 0; s < size; ++s){
        float mean = 0;
        for(i = 0; i < n; ++i){
            mean += fabs(input[i*size + s]);
        }
        mean = mean / n;
        for(i = 0; i < n; ++i){
            binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
        }
    }
}
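
/*
 * Note on the binarization above (XNOR-Net style): each filter f is replaced by its
 * sign pattern scaled by that filter's mean absolute value,
 *     binary[f][i] = (w[f][i] > 0 ? +1 : -1) * mean(|w[f][:]|),
 * so a real-valued convolution can be approximated by a binary convolution plus one
 * per-filter scale. binarize_cpu() is the unscaled (+1/-1) variant used for inputs.
 */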
int convolutional_out_height(convolutional_layer l)
{
    return (l.h + 2*l.pad - l.size) / l.stride_y + 1;
}

int convolutional_out_width(convolutional_layer l)
{
    return (l.w + 2*l.pad - l.size) / l.stride_x + 1;
}
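
/*
 * Standard convolution output-size formula: out = (in + 2*pad - kernel) / stride + 1.
 * Example (values from test_convolutional_layer below): in = 5, pad = 1, kernel = 5,
 * stride = 2  ->  (5 + 2 - 5) / 2 + 1 = 2, i.e. a 2 x 2 output map.
 */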
image get_convolutional_image(convolutional_layer l)
{
    int h,w,c;
    h = convolutional_out_height(l);
    w = convolutional_out_width(l);
    c = l.n;
    return float_to_image(w,h,c,l.output);
}

image get_convolutional_delta(convolutional_layer l)
{
    int h,w,c;
    h = convolutional_out_height(l);
    w = convolutional_out_width(l);
    c = l.n;
    return float_to_image(w,h,c,l.delta);
}

size_t get_workspace_size32(layer l){
#ifdef CUDNN
    if(gpu_index >= 0){
        size_t most = 0;
        size_t s = 0;
        CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
                l.srcTensorDesc,
                l.weightDesc,
                l.convDesc,
                l.dstTensorDesc,
                l.fw_algo,
                &s));
        if (s > most) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
                l.srcTensorDesc,
                l.ddstTensorDesc,
                l.convDesc,
                l.dweightDesc,
                l.bf_algo,
                &s));
        if (s > most && l.train) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
                l.weightDesc,
                l.ddstTensorDesc,
                l.convDesc,
                l.dsrcTensorDesc,
                l.bd_algo,
                &s));
        if (s > most && l.train) most = s;
        return most;
    }
#endif
    if (l.xnor) {
        size_t re_packed_input_size = l.c * l.w * l.h * sizeof(float);
        size_t workspace_size = (size_t)l.bit_align*l.size*l.size*l.c * sizeof(float);
        if (workspace_size < re_packed_input_size) workspace_size = re_packed_input_size;
        return workspace_size;
    }
    return (size_t)l.out_h*l.out_w*l.size*l.size*(l.c / l.groups)*sizeof(float);
}

size_t get_workspace_size16(layer l) {
#if defined(CUDNN) && defined(CUDNN_HALF)
    if (gpu_index >= 0) {
        size_t most = 0;
        size_t s = 0;
        CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
                l.srcTensorDesc16,
                l.weightDesc16,
                l.convDesc,
                l.dstTensorDesc16,
                l.fw_algo16,
                &s));
        if (s > most) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
                l.srcTensorDesc16,
                l.ddstTensorDesc16,
                l.convDesc,
                l.dweightDesc16,
                l.bf_algo16,
                &s));
        if (s > most && l.train) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
                l.weightDesc16,
                l.ddstTensorDesc16,
                l.convDesc,
                l.dsrcTensorDesc16,
                l.bd_algo16,
                &s));
        if (s > most && l.train) most = s;
        return most;
    }
#endif
    return 0;
    //if (l.xnor) return (size_t)l.bit_align*l.size*l.size*l.c * sizeof(float);
    //return (size_t)l.out_h*l.out_w*l.size*l.size*l.c * sizeof(float);
}

size_t get_convolutional_workspace_size(layer l) {
    size_t workspace_size = get_workspace_size32(l);
    size_t workspace_size16 = get_workspace_size16(l);
    if (workspace_size16 > workspace_size) workspace_size = workspace_size16;
    return workspace_size;
}
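
/*
 * On the CPU the workspace is simply the im2col column buffer for one image:
 * out_h*out_w columns of size*size*(c/groups) floats each. With cuDNN it is the
 * largest scratch buffer requested by the selected forward/backward algorithms; the
 * FP32 and FP16 paths are both queried so a single buffer can serve either.
 */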
#ifdef GPU
#ifdef CUDNN
void create_convolutional_cudnn_tensors(layer *l)
{
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->normTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->normDstTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->srcTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dstTensorDesc));
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&l->weightDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dsrcTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->ddstTensorDesc));
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&l->dweightDesc));

    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->normDstTensorDescF16));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->srcTensorDesc16));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dstTensorDesc16));
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&l->weightDesc16));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dsrcTensorDesc16));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->ddstTensorDesc16));
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&l->dweightDesc16));

    CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&l->convDesc));
}

void cudnn_convolutional_setup(layer *l, int cudnn_preference, size_t workspace_size_specify)
{
    // CUDNN_HALF
    // TRUE_HALF_CONFIG is only supported on architectures with true FP16 support (compute capability 5.3 and 6.0):
    //   Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
    // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;

#if(CUDNN_MAJOR >= 7)
    // Tensor Cores use CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH.
    // CUDNN_DATA_FLOAT can be used for the *_ALGO_WINOGRAD_NONFUSED algorithms;
    // otherwise the input, filter and output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) must have dataType = CUDNN_DATA_HALF.
    // Three techniques for mixed-precision training: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
    //   1. Accumulation into FP32
    //   2. Loss scaling - required only for activation gradients (not used here)
    //   3. FP32 master copy of weights
    // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
    CHECK_CUDNN(cudnnSetConvolutionGroupCount(l->convDesc, l->groups));
    CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH));
#if((CUDNN_MAJOR*10 + CUDNN_MINOR) >= 72)   // cuDNN >= 7.2
    //CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); // reduces the speed of regular and group convolution
#endif
#else   //if(CUDNN_MAJOR >= 7)
    if (l->groups > 1) {
        error("CUDNN < 7 doesn't support groups, please upgrade!");
    }
#endif

    // INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported
    // on architectures with DP4A support (compute capability 6.1 and later).
    //cudnnDataType_t data_type = CUDNN_DATA_INT8;

    // backward delta
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));

    // forward
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));

    //#ifdef CUDNN_HALF
    // backward delta
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));

    // forward
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));

    // batch norm
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w));
    //#endif

    // batch norm
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));

    //printf("\n l->dilation = %d, l->pad = %d, l->size = %d \n", l->dilation, l->pad, l->size);
#if(CUDNN_MAJOR >= 6)
    CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad * l->dilation, l->stride_y, l->stride_x, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));    // cuDNN >= 6.0
#else
    CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad * l->dilation, l->stride_y, l->stride_x, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION));    // cuDNN 5.1
#endif

    int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
    int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
    int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
    if (cudnn_preference == cudnn_smallest)
    {
        forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
        printf(" CUDNN-slow ");
    }
    if (cudnn_preference == cudnn_specify)
    {
        forward_algo = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
        //printf(" CUDNN-specified %zu ", workspace_size_specify);
    }

    CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
            (cudnnConvolutionFwdPreference_t)forward_algo,
            workspace_size_specify,
            &l->fw_algo));
    CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            (cudnnConvolutionBwdDataPreference_t)backward_algo,
            workspace_size_specify,
            &l->bd_algo));
    CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
            (cudnnConvolutionBwdFilterPreference_t)backward_filter,
            workspace_size_specify,
            &l->bf_algo));

    //if (data_type == CUDNN_DATA_HALF)
    {
        // HALF-16 if (data_type == CUDNN_DATA_HALF)
        l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
        l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
        l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;

        // FLOAT-32 if (data_type == CUDNN_DATA_FLOAT)
        //l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
        //l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
        //l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
    }
}
#endif
#endif
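
/*
 * The cudnn_preference argument selects how aggressively cuDNN may trade memory for
 * speed: cudnn_fastest picks the fastest algorithms regardless of workspace size,
 * cudnn_smallest forbids any workspace, and cudnn_specify caps the workspace at
 * workspace_size_specify bytes (see set_specified_workspace_limit and the fallback
 * in resize_convolutional_layer below).
 */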
void free_convolutional_batchnorm(convolutional_layer *l)
{
    if (!l->share_layer) {
        free(l->scales);            l->scales = NULL;
        free(l->scale_updates);     l->scale_updates = NULL;
        free(l->mean);              l->mean = NULL;
        free(l->variance);          l->variance = NULL;
        free(l->mean_delta);        l->mean_delta = NULL;
        free(l->variance_delta);    l->variance_delta = NULL;
        free(l->rolling_mean);      l->rolling_mean = NULL;
        free(l->rolling_variance);  l->rolling_variance = NULL;
        free(l->x);                 l->x = NULL;
        free(l->x_norm);            l->x_norm = NULL;

#ifdef GPU
        cuda_free(l->scales_gpu);           l->scales_gpu = NULL;
        cuda_free(l->scale_updates_gpu);    l->scale_updates_gpu = NULL;
        cuda_free(l->mean_gpu);             l->mean_gpu = NULL;
        cuda_free(l->variance_gpu);         l->variance_gpu = NULL;
        cuda_free(l->mean_delta_gpu);       l->mean_delta_gpu = NULL;
        cuda_free(l->variance_delta_gpu);   l->variance_delta_gpu = NULL;
        cuda_free(l->rolling_mean_gpu);     l->rolling_mean_gpu = NULL;
        cuda_free(l->rolling_variance_gpu); l->rolling_variance_gpu = NULL;
        cuda_free(l->x_gpu);                l->x_gpu = NULL;
        cuda_free(l->x_norm_gpu);           l->x_norm_gpu = NULL;
#endif
    }
}
convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation, int deform, int train)
{
    int total_batch = batch*steps;
    int i;
    convolutional_layer l = { (LAYER_TYPE)0 };
    l.type = CONVOLUTIONAL;
    l.train = train;

    if (xnor) groups = 1;   // disable groups for XNOR-net
    if (groups < 1) groups = 1;

    const int blur_stride_x = stride_x;
    const int blur_stride_y = stride_y;
    l.antialiasing = antialiasing;
    if (antialiasing) {
        stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1;   // use stride=1 in host-layer
    }

    l.deform = deform;
    l.assisted_excitation = assisted_excitation;
    l.share_layer = share_layer;
    l.index = index;
    l.h = h;
    l.w = w;
    l.c = c;
    l.groups = groups;
    l.n = n;
    l.binary = binary;
    l.xnor = xnor;
    l.use_bin_output = use_bin_output;
    l.batch = batch;
    l.steps = steps;
    l.stride = stride_x;
    l.stride_x = stride_x;
    l.stride_y = stride_y;
    l.dilation = dilation;
    l.size = size;
    l.pad = padding;
    l.batch_normalize = batch_normalize;
    l.learning_rate_scale = 1;
    l.nweights = (c / groups) * n * size * size;

    if (l.share_layer) {
        if (l.size != l.share_layer->size || l.nweights != l.share_layer->nweights || l.c != l.share_layer->c || l.n != l.share_layer->n) {
            printf("Layer size, nweights, channels or filters don't match for the share_layer\n");
            getchar();
        }
        l.weights = l.share_layer->weights;
        l.weight_updates = l.share_layer->weight_updates;
        l.biases = l.share_layer->biases;
        l.bias_updates = l.share_layer->bias_updates;
    }
    else {
        l.weights = (float*)xcalloc(l.nweights, sizeof(float));
        l.biases = (float*)xcalloc(n, sizeof(float));
        if (train) {
            l.weight_updates = (float*)xcalloc(l.nweights, sizeof(float));
            l.bias_updates = (float*)xcalloc(n, sizeof(float));
        }
    }

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c/groups));
    for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_uniform(-1, 1);   // rand_normal();

    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;
    l.activation = activation;

    l.output = (float*)xcalloc(total_batch*l.outputs, sizeof(float));
#ifndef GPU
    if (train) l.delta = (float*)xcalloc(total_batch*l.outputs, sizeof(float));
#endif  // not GPU

    l.forward = forward_convolutional_layer;
    l.backward = backward_convolutional_layer;
    l.update = update_convolutional_layer;

    if(binary){
        l.binary_weights = (float*)xcalloc(l.nweights, sizeof(float));
        l.cweights = (char*)xcalloc(l.nweights, sizeof(char));
        l.scales = (float*)xcalloc(n, sizeof(float));
    }
    if(xnor){
        l.binary_weights = (float*)xcalloc(l.nweights, sizeof(float));
        l.binary_input = (float*)xcalloc(l.inputs * l.batch, sizeof(float));

        int align = 32; // 8;
        int src_align = l.out_h*l.out_w;
        l.bit_align = src_align + (align - src_align % align);

        l.mean_arr = (float*)xcalloc(l.n, sizeof(float));

        const size_t new_c = l.c / 32;
        size_t in_re_packed_input_size = new_c * l.w * l.h + 1;
        l.bin_re_packed_input = (uint32_t*)xcalloc(in_re_packed_input_size, sizeof(uint32_t));

        l.lda_align = 256;  // AVX2
        int k = l.size*l.size*l.c;
        size_t k_aligned = k + (l.lda_align - k%l.lda_align);
        size_t t_bit_input_size = k_aligned * l.bit_align / 8;
        l.t_bit_input = (char*)xcalloc(t_bit_input_size, sizeof(char));
    }
    if(batch_normalize){
        if (l.share_layer) {
            l.scales = l.share_layer->scales;
            l.scale_updates = l.share_layer->scale_updates;
            l.mean = l.share_layer->mean;
            l.variance = l.share_layer->variance;
            l.mean_delta = l.share_layer->mean_delta;
            l.variance_delta = l.share_layer->variance_delta;
            l.rolling_mean = l.share_layer->rolling_mean;
            l.rolling_variance = l.share_layer->rolling_variance;
        }
        else {
            l.scales = (float*)xcalloc(n, sizeof(float));
            for (i = 0; i < n; ++i) {
                l.scales[i] = 1;
            }
            if (train) {
                l.scale_updates = (float*)xcalloc(n, sizeof(float));

                l.mean = (float*)xcalloc(n, sizeof(float));
                l.variance = (float*)xcalloc(n, sizeof(float));

                l.mean_delta = (float*)xcalloc(n, sizeof(float));
                l.variance_delta = (float*)xcalloc(n, sizeof(float));
            }
            l.rolling_mean = (float*)xcalloc(n, sizeof(float));
            l.rolling_variance = (float*)xcalloc(n, sizeof(float));
        }

#ifndef GPU
        if (train) {
            l.x = (float*)xcalloc(total_batch * l.outputs, sizeof(float));
            l.x_norm = (float*)xcalloc(total_batch * l.outputs, sizeof(float));
        }
#endif  // not GPU
    }

#ifndef GPU
    if (l.activation == SWISH || l.activation == MISH) l.activation_input = (float*)calloc(total_batch*l.outputs, sizeof(float));
#endif  // not GPU

    if(adam){
        l.adam = 1;
        l.m = (float*)xcalloc(l.nweights, sizeof(float));
        l.v = (float*)xcalloc(l.nweights, sizeof(float));
        l.bias_m = (float*)xcalloc(n, sizeof(float));
        l.scale_m = (float*)xcalloc(n, sizeof(float));
        l.bias_v = (float*)xcalloc(n, sizeof(float));
        l.scale_v = (float*)xcalloc(n, sizeof(float));
    }

#ifdef GPU
    l.forward_gpu = forward_convolutional_layer_gpu;
    l.backward_gpu = backward_convolutional_layer_gpu;
    l.update_gpu = update_convolutional_layer_gpu;

    if(gpu_index >= 0){
        if (l.activation == SWISH || l.activation == MISH) {
            l.activation_input_gpu = cuda_make_array(l.activation_input, total_batch*l.outputs);
        }

        if (l.deform) l.weight_deform_gpu = cuda_make_array(NULL, l.nweights);

        if (adam) {
            l.m_gpu = cuda_make_array(l.m, l.nweights);
            l.v_gpu = cuda_make_array(l.v, l.nweights);
            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
        }

        if (l.share_layer) {
            l.weights_gpu = l.share_layer->weights_gpu;
            l.weight_updates_gpu = l.share_layer->weight_updates_gpu;
            l.weights_gpu16 = l.share_layer->weights_gpu16;
            l.weight_updates_gpu16 = l.share_layer->weight_updates_gpu16;
            l.biases_gpu = l.share_layer->biases_gpu;
            l.bias_updates_gpu = l.share_layer->bias_updates_gpu;
        }
        else {
            l.weights_gpu = cuda_make_array(l.weights, l.nweights);
            if (train) l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);
#ifdef CUDNN_HALF
            l.weights_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1);
            if (train) l.weight_updates_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1);
#endif  // CUDNN_HALF
            l.biases_gpu = cuda_make_array(l.biases, n);
            if (train) l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
        }

        l.output_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n);
        if (train) l.delta_gpu = cuda_make_array(l.delta, total_batch*out_h*out_w*n);

        if(binary){
            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
        }
        if(xnor){
            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
            l.mean_arr_gpu = cuda_make_array(0, l.n);
            l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
        }

        if(batch_normalize){
            if (l.share_layer) {
                l.scales_gpu = l.share_layer->scales_gpu;
                l.scale_updates_gpu = l.share_layer->scale_updates_gpu;
                l.mean_gpu = l.share_layer->mean_gpu;
                l.variance_gpu = l.share_layer->variance_gpu;
                l.rolling_mean_gpu = l.share_layer->rolling_mean_gpu;
                l.rolling_variance_gpu = l.share_layer->rolling_variance_gpu;
                l.mean_delta_gpu = l.share_layer->mean_delta_gpu;
                l.variance_delta_gpu = l.share_layer->variance_delta_gpu;
            }
            else {
                l.scales_gpu = cuda_make_array(l.scales, n);

                if (train) {
                    l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);

                    l.mean_gpu = cuda_make_array(l.mean, n);
                    l.variance_gpu = cuda_make_array(l.variance, n);
#ifndef CUDNN
                    l.mean_delta_gpu = cuda_make_array(l.mean, n);
                    l.variance_delta_gpu = cuda_make_array(l.variance, n);
#endif  // CUDNN
                }

                l.rolling_mean_gpu = cuda_make_array(l.mean, n);
                l.rolling_variance_gpu = cuda_make_array(l.variance, n);
            }

            if (train) {
                l.x_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n);
#ifndef CUDNN
                l.x_norm_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n);
#endif  // CUDNN
            }
        }

        if (l.assisted_excitation)
        {
            const int size = l.out_w * l.out_h * l.batch;
            l.gt_gpu = cuda_make_array(NULL, size);
            l.a_avg_gpu = cuda_make_array(NULL, size);
        }
#ifdef CUDNN
        create_convolutional_cudnn_tensors(&l);
        cudnn_convolutional_setup(&l, cudnn_fastest, 0);
#endif  // CUDNN
    }
#endif  // GPU
    l.workspace_size = get_convolutional_workspace_size(l);

    //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
    l.bflops = (2.0 * l.nweights * l.out_h*l.out_w) / 1000000000.;
    if (l.xnor) l.bflops = l.bflops / 32;
    if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB");
    else if (l.xnor) fprintf(stderr, "convX ");
    else if (l.share_layer) fprintf(stderr, "convS ");
    else if (l.assisted_excitation) fprintf(stderr, "convAE");
    else fprintf(stderr, "conv ");

    if (groups > 1) fprintf(stderr, "%5d/%4d ", n, groups);
    else            fprintf(stderr, "%5d ", n);

    if (stride_x != stride_y) fprintf(stderr, "%2dx%2d/%2dx%2d ", size, size, stride_x, stride_y);
    else {
        if (dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride_x, dilation);
        else              fprintf(stderr, "%2d x%2d/%2d ", size, size, stride_x);
    }

    fprintf(stderr, "%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);

    //fprintf(stderr, "%5d/%2d %2d x%2d /%2d(%d)%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, groups, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);

    if (l.antialiasing) {
        printf("AA: ");
        l.input_layer = (layer*)calloc(1, sizeof(layer));
        int blur_size = 3;
        int blur_pad = blur_size / 2;
        if (l.antialiasing == 2) {
            blur_size = 2;
            blur_pad = 0;
        }
        *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL, 0, 0, train);
        const int blur_nweights = n * blur_size * blur_size;   // (n / n) * n * blur_size * blur_size;
        int i;
        if (blur_size == 2) {
            for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
                l.input_layer->weights[i + 0] = 1 / 4.f;
                l.input_layer->weights[i + 1] = 1 / 4.f;
                l.input_layer->weights[i + 2] = 1 / 4.f;
                l.input_layer->weights[i + 3] = 1 / 4.f;
            }
        }
        else {
            for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
                l.input_layer->weights[i + 0] = 1 / 16.f;
                l.input_layer->weights[i + 1] = 2 / 16.f;
                l.input_layer->weights[i + 2] = 1 / 16.f;

                l.input_layer->weights[i + 3] = 2 / 16.f;
                l.input_layer->weights[i + 4] = 4 / 16.f;
                l.input_layer->weights[i + 5] = 2 / 16.f;

                l.input_layer->weights[i + 6] = 1 / 16.f;
                l.input_layer->weights[i + 7] = 2 / 16.f;
                l.input_layer->weights[i + 8] = 1 / 16.f;
            }
        }
        for (i = 0; i < n; ++i) l.input_layer->biases[i] = 0;
#ifdef GPU
        if (gpu_index >= 0) {
            l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs);
            push_convolutional_layer(*(l.input_layer));
        }
#endif  // GPU
    }

    return l;
}
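
/*
 * A minimal usage sketch (parameter values are illustrative, not from the original):
 * a 3x3 convolution with 64 filters over a 416x416x3 input, stride 1, pad 1,
 * batch-normalized, LEAKY activation, built for training:
 *
 *     convolutional_layer l = make_convolutional_layer(
 *         1, 1, 416, 416, 3, 64, 1, 3,    // batch, steps, h, w, c, n, groups, size
 *         1, 1, 1, 1,                     // stride_x, stride_y, dilation, padding
 *         LEAKY, 1, 0, 0, 0, 0,           // activation, batch_normalize, binary, xnor, adam, use_bin_output
 *         0, 0, NULL, 0, 0, 1);           // index, antialiasing, share_layer, assisted_excitation, deform, train
 *
 * The parser normally builds layers from a .cfg file; test_convolutional_layer()
 * below shows another direct call.
 */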
void denormalize_convolutional_layer(convolutional_layer l)
{
    int i, j;
    const int filter_size = l.nweights / l.n;   // weights per filter
    for(i = 0; i < l.n; ++i){
        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
        for(j = 0; j < filter_size; ++j){
            l.weights[i*filter_size + j] *= scale;
        }
        l.biases[i] -= l.rolling_mean[i] * scale;
        l.scales[i] = 1;
        l.rolling_mean[i] = 0;
        l.rolling_variance[i] = 1;
    }
}
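
/*
 * denormalize folds the batch-norm statistics into the weights and biases so the
 * layer can run without a normalization step:
 *     w' = w * s / sqrt(var + eps),   b' = b - mean * s / sqrt(var + eps),
 * after which scales, rolling_mean and rolling_variance are reset to 1, 0 and 1.
 */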
void test_convolutional_layer()
{
    convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, 0, NULL, 0, 0, 0);
    l.batch_normalize = 1;
    float data[] = {1,1,1,1,1,
        1,1,1,1,1,
        1,1,1,1,1,
        1,1,1,1,1,
        1,1,1,1,1,
        2,2,2,2,2,
        2,2,2,2,2,
        2,2,2,2,2,
        2,2,2,2,2,
        2,2,2,2,2,
        3,3,3,3,3,
        3,3,3,3,3,
        3,3,3,3,3,
        3,3,3,3,3,
        3,3,3,3,3};
    network_state state = {0};
    state.input = data;
    forward_convolutional_layer(l, state);
}
void resize_convolutional_layer(convolutional_layer *l, int w, int h)
{
    int total_batch = l->batch*l->steps;
    int old_w = l->w;
    int old_h = l->h;
    l->w = w;
    l->h = h;
    int out_w = convolutional_out_width(*l);
    int out_h = convolutional_out_height(*l);

    l->out_w = out_w;
    l->out_h = out_h;

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->w * l->h * l->c;

    l->output = (float*)xrealloc(l->output, total_batch * l->outputs * sizeof(float));
    if (l->train) {
        l->delta = (float*)xrealloc(l->delta, total_batch * l->outputs * sizeof(float));

        if (l->batch_normalize) {
            l->x = (float*)xrealloc(l->x, total_batch * l->outputs * sizeof(float));
            l->x_norm = (float*)xrealloc(l->x_norm, total_batch * l->outputs * sizeof(float));
        }
    }

    if (l->xnor) {
        //l->binary_input = realloc(l->inputs*l->batch, sizeof(float));
    }

    if (l->activation == SWISH || l->activation == MISH) l->activation_input = (float*)realloc(l->activation_input, total_batch*l->outputs * sizeof(float));

#ifdef GPU
    if (old_w < w || old_h < h) {
        if (l->train) {
            cuda_free(l->delta_gpu);
            l->delta_gpu = cuda_make_array(l->delta, total_batch*l->outputs);
        }

        cuda_free(l->output_gpu);
        l->output_gpu = cuda_make_array(l->output, total_batch*l->outputs);

        if (l->batch_normalize) {
            cuda_free(l->x_gpu);
            l->x_gpu = cuda_make_array(l->output, total_batch*l->outputs);

#ifndef CUDNN
            cuda_free(l->x_norm_gpu);
            l->x_norm_gpu = cuda_make_array(l->output, total_batch*l->outputs);
#endif  // CUDNN
        }

        if (l->xnor) {
            cuda_free(l->binary_input_gpu);
            l->binary_input_gpu = cuda_make_array(0, l->inputs*l->batch);
        }

        if (l->activation == SWISH || l->activation == MISH) {
            cuda_free(l->activation_input_gpu);
            l->activation_input_gpu = cuda_make_array(l->activation_input, total_batch*l->outputs);
        }

        if (l->assisted_excitation)
        {
            cuda_free(l->gt_gpu);
            cuda_free(l->a_avg_gpu);

            const int size = l->out_w * l->out_h * l->batch;
            l->gt_gpu = cuda_make_array(NULL, size);
            l->a_avg_gpu = cuda_make_array(NULL, size);
        }
    }
#ifdef CUDNN
    cudnn_convolutional_setup(l, cudnn_fastest, 0);
#endif
#endif
    l->workspace_size = get_convolutional_workspace_size(*l);

#ifdef CUDNN
    // check for excessive memory consumption
    size_t free_byte;
    size_t total_byte;
    CHECK_CUDA(cudaMemGetInfo(&free_byte, &total_byte));
    if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
        printf(" used slow CUDNN algo without Workspace! Need memory: %zu, available: %zu\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
        cudnn_convolutional_setup(l, cudnn_smallest, 0);
        l->workspace_size = get_convolutional_workspace_size(*l);
    }
#endif
}

void set_specified_workspace_limit(convolutional_layer *l, size_t workspace_size_limit)
{
#ifdef CUDNN
    size_t free_byte;
    size_t total_byte;
    CHECK_CUDA(cudaMemGetInfo(&free_byte, &total_byte));
    cudnn_convolutional_setup(l, cudnn_specify, workspace_size_limit);
    l->workspace_size = get_convolutional_workspace_size(*l);
    //printf("Set specified workspace limit for cuDNN: %zu, available: %zu, workspace = %zu \n", workspace_size_limit, free_byte, l->workspace_size);
#endif  // CUDNN
}
void add_bias(float *output, float *biases, int batch, int n, int size)
{
    int i,j,b;
    for(b = 0; b < batch; ++b){
        for(i = 0; i < n; ++i){
            for(j = 0; j < size; ++j){
                output[(b*n + i)*size + j] += biases[i];
            }
        }
    }
}

void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    int i,j,b;
    for(b = 0; b < batch; ++b){
        for(i = 0; i < n; ++i){
            for(j = 0; j < size; ++j){
                output[(b*n + i)*size + j] *= scales[i];
            }
        }
    }
}

void backward_bias(float *bias_updates, float *delta, int batch, int n, int size)
{
    int i,b;
    for(b = 0; b < batch; ++b){
        for(i = 0; i < n; ++i){
            bias_updates[i] += sum_array(delta+size*(i+b*n), size);
        }
    }
}
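
/*
 * These helpers assume the NCHW layout used throughout this file: output is indexed
 * as output[(b*n + i)*size + j], where b = image in the batch, i = filter/channel,
 * j = spatial position and size = out_h*out_w. One bias (or scale) per output channel
 * is broadcast over all spatial positions; backward_bias sums delta the same way.
 */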
void gemm_nn_custom(int M, int N, int K, float ALPHA,
    float *A, int lda,
    float *B, int ldb,
    float *C, int ldc)
{
    int i, j, k;
    for (i = 0; i < M; ++i) {
        for (k = 0; k < K; ++k) {
            PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k];
            //printf("\n weight = %f \n", A_PART);
            for (j = 0; j < N; ++j) {
                C[i*ldc + j] += A_PART*B[k*ldb + j];
            }
        }
    }
}
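
/*
 * Reference (unoptimized) row-major GEMM: C[M x N] += ALPHA * A[M x K] * B[K x N],
 * with lda/ldb/ldc the leading dimensions (row strides) of the three matrices.
 * The production path uses gemm() from gemm.c; a call to this variant is left
 * commented out in forward_convolutional_layer() below.
 */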
void get_mean_array(float *src, size_t size, size_t filters, float *mean_arr) {
    size_t i, counter;
    counter = 0;
    for (i = 0; i < size; i += size / filters) {
        mean_arr[counter++] = fabs(src[i]);
    }
}

/*
void float_to_bit(float *src, unsigned char *dst, size_t size) {
    size_t dst_size = size / 8 + 1;
    memset(dst, 0, dst_size);
    size_t i, dst_i, dst_shift;
    for (i = 0; i < size; ++i) {
        if (src[i] > 0) set_bit(dst, i);
    }
}
*/

void bit_to_float(unsigned char *src, float *dst, size_t size, size_t filters, float *mean_arr) {
    memset(dst, 0, size * sizeof(float));
    size_t i;

    for (i = 0; i < size; ++i) {
        float mean_val = 1;
        if (mean_arr != NULL) mean_val = fabs(mean_arr[i / (size / filters)]);
        if (get_bit(src, i)) dst[i] = mean_val;
        else dst[i] = -mean_val;
    }
}
void binary_align_weights(convolutional_layer *l)
{
    int m = l->n;                   // (l->n / l->groups)
    int k = l->size*l->size*l->c;   // ->size*l->size*(l->c / l->groups)
    size_t new_lda = k + (l->lda_align - k % l->lda_align); // (k / 8 + 1) * 8;
    l->new_lda = new_lda;

    binarize_weights(l->weights, m, k, l->binary_weights);

    size_t align_weights_size = new_lda * m;
    l->align_bit_weights_size = align_weights_size / 8 + 1;
    float* align_weights = (float*)xcalloc(align_weights_size, sizeof(float));
    l->align_bit_weights = (char*)xcalloc(l->align_bit_weights_size, sizeof(char));

    size_t i, j;
    // align A without transpose
    for (i = 0; i < m; ++i) {
        for (j = 0; j < k; ++j) {
            align_weights[i*new_lda + j] = l->binary_weights[i*k + j];
        }
    }

    if (l->c % 32 == 0)
    //if (gpu_index < 0 && l->stride == 1 && l->pad == 1 && l->c % 32 == 0)
    //if (l->stride == 1 && l->pad == 1 && l->c % 32 == 0)
    {
        int fil, chan;
        const int items_per_filter = l->c * l->size * l->size;
        //const int dst_items_per_filter = new_lda;
        for (fil = 0; fil < l->n; ++fil)
        {
            for (chan = 0; chan < l->c; chan += 32)
            {
                const int items_per_channel = l->size*l->size;
                for (i = 0; i < items_per_channel; ++i)
                {
                    //uint32_t val = 0;
                    int c_pack;
                    for (c_pack = 0; c_pack < 32; ++c_pack) {
                        float src = l->binary_weights[fil*items_per_filter + (chan + c_pack)*items_per_channel + i];

                        //align_weights[fil*items_per_filter + chan*items_per_channel + i * 32 + c_pack] = src;
                        align_weights[fil*new_lda + chan*items_per_channel + i*32 + c_pack] = src;
                        //val |= (src << c);
                    }
                }
            }
        }

        //printf("\n l.index = %d \t aw[0] = %f, aw[1] = %f, aw[2] = %f, aw[3] = %f \n", l->index, align_weights[0], align_weights[1], align_weights[2], align_weights[3]);
        //memcpy(l->binary_weights, align_weights, (l->size * l->size * l->c * l->n) * sizeof(float));

        float_to_bit(align_weights, (unsigned char*)l->align_bit_weights, align_weights_size);

        //if (l->n >= 32)
        if (gpu_index >= 0)
        {
            //int M = l->n;
            //int N = l->out_w*l->out_h;
            //printf("\n M = %d, N = %d, M %% 8 = %d, N %% 8 = %d - weights \n", M, N, M % 8, N % 8);
            //printf("\n l.w = %d, l.c = %d, l.n = %d \n", l->w, l->c, l->n);
            for (i = 0; i < align_weights_size / 8; ++i) l->align_bit_weights[i] = ~(l->align_bit_weights[i]);
        }

        get_mean_array(l->binary_weights, m*k, l->n, l->mean_arr);
        //get_mean_array(l->binary_weights, m*new_lda, l->n, l->mean_arr);
    }
    else {
        float_to_bit(align_weights, (unsigned char*)l->align_bit_weights, align_weights_size);

        get_mean_array(l->binary_weights, m*k, l->n, l->mean_arr);
    }

    //l->mean_arr = calloc(l->n, sizeof(float));
    //get_mean_array(align_weights, align_weights_size, l->n, l->mean_arr);

#ifdef GPU
    cudaError_t status;
    l->align_workspace_size = l->bit_align * l->size * l->size * l->c;
    status = cudaMalloc((void **)&l->align_workspace_gpu, l->align_workspace_size * sizeof(float));
    status = cudaMalloc((void **)&l->transposed_align_workspace_gpu, l->align_workspace_size * sizeof(float));
    CHECK_CUDA(status);

    //l->align_bit_weights_gpu = cuda_make_array(l->align_bit_weights, l->align_bit_weights_size * sizeof(char)/sizeof(float));
    status = cudaMalloc((void **)&l->align_bit_weights_gpu, l->align_bit_weights_size);
    CHECK_CUDA(status);
    status = cudaMemcpy(l->align_bit_weights_gpu, l->align_bit_weights, l->align_bit_weights_size, cudaMemcpyHostToDevice);
    CHECK_CUDA(status);
    status = cudaMemcpy(l->binary_weights_gpu, l->binary_weights, m*k * sizeof(float), cudaMemcpyHostToDevice);
    CHECK_CUDA(status);

    //l->mean_arr_gpu = cuda_make_array(l->mean_arr, l->n);
    cuda_push_array(l->mean_arr_gpu, l->mean_arr, l->n);
    CHECK_CUDA(cudaDeviceSynchronize());
#endif  // GPU

    free(align_weights);
}
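
/*
 * Summary of the packing above: each filter's binarized weights are copied into a row
 * padded to new_lda (a multiple of lda_align = 256, sized for AVX2), optionally
 * reordered in groups of 32 channels when l->c % 32 == 0, and then packed
 * 32 floats -> 1 uint32_t of sign bits by float_to_bit(). The per-filter scale
 * factors stay in l->mean_arr for gemm_nn_custom_bin_mean_transposed().
 */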
// binary transpose
size_t binary_transpose_align_input(int k, int n, float *b, char **t_bit_input, size_t ldb_align, int bit_align)
{
    size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
    //printf("\n n = %d, bit_align = %d \n", n, bit_align);
    size_t t_intput_size = new_ldb * bit_align;// n;
    size_t t_bit_input_size = t_intput_size / 8;// +1;

    memset(*t_bit_input, 0, t_bit_input_size * sizeof(char));
    //int src_size = k * bit_align;

    // b - [bit_align, k] - [l.bit_align, l.size*l.size*l.c] = src_size
    // t_input - [bit_align, k] - [n', k]
    // t_bit_input - [new_ldb, n] - [k', n]

    //transpose_bin(t_input, *t_bit_input, k, n, bit_align, new_ldb, 8);
    transpose_bin((uint32_t*)b, (uint32_t*)*t_bit_input, k, n, bit_align, new_ldb, 8);

    return t_intput_size;
}
void forward_convolutional_layer(convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i, j;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    if (l.xnor && (!l.align_bit_weights || state.train)) {
        if (!l.align_bit_weights || state.train) {
            binarize_weights(l.weights, l.n, l.nweights, l.binary_weights);
            //printf("\n binarize_weights l.align_bit_weights = %p \n", l.align_bit_weights);
        }
        swap_binary(&l);
        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
        state.input = l.binary_input;
    }

    int m = l.n / l.groups;
    int k = l.size*l.size*l.c / l.groups;
    int n = out_h*out_w;

    static int u = 0;
    u++;

    for(i = 0; i < l.batch; ++i)
    {
        for (j = 0; j < l.groups; ++j)
        {
            float *a = l.weights + j*l.nweights / l.groups;
            float *b = state.workspace;
            float *c = l.output + (i*l.groups + j)*n*m;

            //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
            //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n);
            if (l.xnor && l.align_bit_weights && !state.train && l.stride_x == l.stride_y)
            {
                memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float));

                if (l.c % 32 == 0)
                {
                    //printf(" l.index = %d - new XNOR \n", l.index);

                    int ldb_align = l.lda_align;
                    size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
                    //size_t t_intput_size = new_ldb * l.bit_align;// n;
                    //size_t t_bit_input_size = t_intput_size / 8;// +1;

                    int re_packed_input_size = l.c * l.w * l.h;
                    memset(state.workspace, 0, re_packed_input_size * sizeof(float));

                    const size_t new_c = l.c / 32;
                    size_t in_re_packed_input_size = new_c * l.w * l.h + 1;
                    memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));

                    //float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
                    //uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));

                    // float32x4 by channel (as in cuDNN)
                    repack_input(state.input, state.workspace, l.w, l.h, l.c);

                    // 32 x floats -> 1 x uint32_t
                    float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h);

                    //free(re_packed_input);

                    // slow - convolution the packed inputs and weights: float x 32 by channel (as in cuDNN)
                    //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output,
                    //    l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr);
                    // // then exit from if()

                    im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
                    //im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);

                    //free(bin_re_packed_input);

                    int new_k = l.size*l.size*l.c / 32;

                    // good for (l.c == 64)
                    //gemm_nn_bin_32bit_packed(m, n, new_k, 1,
                    //    l.align_bit_weights, l.new_lda/32,
                    //    b, n,
                    //    c, n, l.mean_arr);

                    // // then exit from if()

                    transpose_uint32((uint32_t *)state.workspace, (uint32_t*)l.t_bit_input, new_k, n, n, new_ldb);

                    // the main GEMM function
                    gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);

                    // // alternative GEMM
                    //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1,
                    //    l.align_bit_weights, l.new_lda/32,
                    //    t_bit_input, new_ldb / 32,
                    //    c, n, l.mean_arr);

                    //free(t_bit_input);
                }
                else
                {   // else (l.c % 32 != 0)

                    //--------------------------------------------------------
                    //printf(" l.index = %d - old XNOR \n", l.index);

                    //im2col_cpu_custom_align(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align);
                    im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);

                    //size_t output_size = l.outputs;
                    //float *count_output = calloc(output_size, sizeof(float));
                    //size_t bit_output_size = output_size / 8 + 1;
                    //char *bit_output = calloc(bit_output_size, sizeof(char));

                    //size_t intput_size = n * k; // (out_h*out_w) X (l.size*l.size*l.c) : after im2col()
                    //size_t bit_input_size = intput_size / 8 + 1;
                    //char *bit_input = calloc(bit_input_size, sizeof(char));

                    //size_t weights_size = k * m; //l.size*l.size*l.c*l.n; // l.nweights
                    //size_t bit_weights_size = weights_size / 8 + 1;
                    //char *bit_weights = calloc(bit_weights_size, sizeof(char));
                    //float *mean_arr = calloc(l.n, sizeof(float));

                    // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits)
                    {
                        //size_t ldb_align = 256; // 256 bit for AVX2
                        int ldb_align = l.lda_align;
                        size_t new_ldb = k + (ldb_align - k%ldb_align);
                        size_t t_intput_size = binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align);

                        // 5x times faster than gemm()-float32
                        gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);

                        //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr);

                        //free(t_input);
                        //free(t_bit_input);
                        //}
                    }
                }

                add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);

                //activate_array(l.output, m*n*l.batch, l.activation);
                if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
                else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
                else activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
                return;
            }
            else {
                //printf(" l.index = %d - FP32 \n", l.index);
                float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;
                if (l.size == 1) {
                    b = im;
                }
                else {
                    //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);

                    im2col_cpu_ext(im,              // input
                        l.c / l.groups,             // input channels
                        l.h, l.w,                   // input size (h, w)
                        l.size, l.size,             // kernel size (h, w)
                        l.pad, l.pad,               // padding (h, w)
                        l.stride_y, l.stride_x,     // stride (h, w)
                        l.dilation, l.dilation,     // dilation (h, w)
                        b);                         // output
                }

                gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
                // bit-count to float
            }
            //c += n*m;
            //state.input += l.c*l.h*l.w;
        }
    }

    if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    }
    else {
        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    }

    //activate_array(l.output, m*n*l.batch, l.activation);
    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
    else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
    else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
    else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
    else activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation);

    if(l.binary || l.xnor) swap_binary(&l);

    //visualize_convolutional_layer(l, "conv_visual", NULL);
    //wait_until_press_key_cv();

    if(l.assisted_excitation && state.train) assisted_excitation_forward(l, state);

    if (l.antialiasing) {
        network_state s = { 0 };
        s.train = state.train;
        s.workspace = state.workspace;
        s.net = state.net;
        s.input = l.output;
        forward_convolutional_layer(*(l.input_layer), s);
        //simple_copy_ongpu(l.outputs*l.batch, l.output, l.input_antialiasing);
        memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float));
    }
}
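
/*
 * FP32 forward pass in one line: for every image and group,
 *     output[m x n] += weights[m x k] * im2col(input)[k x n],
 * with m = filters/groups, k = size*size*c/groups and n = out_h*out_w, followed by
 * batch norm (or a bias add) and the activation. The XNOR branch computes the same
 * product with bit-packed operands via gemm_nn_custom_bin_mean_transposed() and
 * returns early.
 */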
void assisted_excitation_forward(convolutional_layer l, network_state state)
{
    const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);

    // epoch
    //const float epoch = (float)(*state.net.seen) / state.net.train_images_num;

    // calculate alpha
    //const float alpha = (1 + cos(3.141592 * iteration_num)) / (2 * state.net.max_batches);
    //const float alpha = (1 + cos(3.141592 * epoch)) / (2 * state.net.max_batches);
    float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches));

    if (l.assisted_excitation > 1) {
        if (iteration_num > l.assisted_excitation) alpha = 0;
        else alpha = (1 + cos(3.141592 * iteration_num / l.assisted_excitation));
    }

    //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n",
    //    epoch, alpha, (*state.net.seen), state.net.max_batches, state.net.train_images_num);

    float *a_avg = (float *)xcalloc(l.out_w * l.out_h * l.batch, sizeof(float));
    float *g = (float *)xcalloc(l.out_w * l.out_h * l.batch, sizeof(float));

    int b;
    int w, h, c;

    l.max_boxes = state.net.num_boxes;
    l.truths = l.max_boxes*(4 + 1);

    for (b = 0; b < l.batch; ++b)
    {
        // calculate G
        int t;
        for (t = 0; t < state.net.num_boxes; ++t) {
            box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
            if (!truth.x) break;    // continue;

            int left = floor((truth.x - truth.w / 2) * l.out_w);
            int right = ceil((truth.x + truth.w / 2) * l.out_w);
            int top = floor((truth.y - truth.h / 2) * l.out_h);
            int bottom = ceil((truth.y + truth.h / 2) * l.out_h);

            for (w = left; w <= right; w++) {
                for (h = top; h < bottom; h++) {
                    g[w + l.out_w * h + l.out_w*l.out_h*b] = 1;
                }
            }
        }
    }

    for (b = 0; b < l.batch; ++b)
    {
        // calculate average A
        for (w = 0; w < l.out_w; w++) {
            for (h = 0; h < l.out_h; h++) {
                for (c = 0; c < l.out_c; c++) {
                    a_avg[w + l.out_w*(h + l.out_h*b)] += l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))];
                }
                a_avg[w + l.out_w*(h + l.out_h*b)] /= l.out_c;  // a_avg / d
            }
        }
    }

    // change activation
    for (b = 0; b < l.batch; ++b)
    {
        for (w = 0; w < l.out_w; w++) {
            for (h = 0; h < l.out_h; h++) {
                for (c = 0; c < l.out_c; c++)
                {
                    // a = a + alpha(t) * e(c,i,j), where e(c,i,j) = g(i,j) * avg_a(i,j) and avg_a is already averaged over channels
                    l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] +=
                        alpha *
                        g[w + l.out_w*(h + l.out_h*b)] *
                        a_avg[w + l.out_w*(h + l.out_h*b)];

                    //l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] =
                    //    alpha * g[w + l.out_w*(h + l.out_h*b)] * a_avg[w + l.out_w*(h + l.out_h*b)];
                }
            }
        }
    }

    if(0)   // visualize ground truth
    {
#ifdef OPENCV
        for (b = 0; b < l.batch; ++b)
        {
            image img = float_to_image(l.out_w, l.out_h, 1, &g[l.out_w*l.out_h*b]);
            char buff[100];
            sprintf(buff, "a_excitation_%d", b);
            show_image_cv(img, buff);

            image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
            char buff2[100];
            sprintf(buff2, "a_excitation_act_%d", b);
            show_image_cv(img2, buff2);
            wait_key_cv(5);
        }
        wait_until_press_key_cv();
#endif  // OPENCV
    }

    free(g);
    free(a_avg);
}
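
/*
 * Assisted excitation boosts activations inside ground-truth boxes early in training:
 * g is a binary mask built from the truth boxes, a_avg is the per-pixel mean over
 * channels, and alpha = 1 + cos(pi * iteration / max_batches) decays smoothly from 2
 * to 0 (or over the first l.assisted_excitation iterations when that value is > 1).
 */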
void backward_convolutional_layer(convolutional_layer l, network_state state)
{
    int i, j;
    int m = l.n / l.groups;
    int n = l.size*l.size*l.c / l.groups;
    int k = l.out_w*l.out_h;

    if (l.activation == SWISH) gradient_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.delta);
    else if (l.activation == MISH) gradient_array_mish(l.outputs*l.batch, l.activation_input, l.delta);
    else if (l.activation == NORM_CHAN_SOFTMAX) gradient_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    else if (l.activation == NORM_CHAN) gradient_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    else gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if (l.batch_normalize) {
        backward_batchnorm_layer(l, state);
    }
    else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
    }

    for (i = 0; i < l.batch; ++i) {
        for (j = 0; j < l.groups; ++j) {
            float *a = l.delta + (i*l.groups + j)*m*k;
            float *b = state.workspace;
            float *c = l.weight_updates + j*l.nweights / l.groups;

            float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;

            //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            im2col_cpu_ext(
                im,                         // input
                l.c / l.groups,             // input channels
                l.h, l.w,                   // input size (h, w)
                l.size, l.size,             // kernel size (h, w)
                l.pad, l.pad,               // padding (h, w)
                l.stride_y, l.stride_x,     // stride (h, w)
                l.dilation, l.dilation,     // dilation (h, w)
                b);                         // output

            gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);

            if (state.delta) {
                a = l.weights + j*l.nweights / l.groups;
                b = l.delta + (i*l.groups + j)*m*k;
                c = state.workspace;

                gemm(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);

                //col2im_cpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride,
                //    l.pad, state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w);
                col2im_cpu_ext(
                    state.workspace,        // input
                    l.c / l.groups,         // input channels (h, w)
                    l.h, l.w,               // input size (h, w)
                    l.size, l.size,         // kernel size (h, w)
                    l.pad, l.pad,           // padding (h, w)
                    l.stride_y, l.stride_x, // stride (h, w)
                    l.dilation, l.dilation, // dilation (h, w)
                    state.delta + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w);   // output (delta)
            }
        }
    }
}
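
/*
 * The two GEMMs above implement the standard convolution gradients:
 *     weight_updates += delta[m x k] * im2col(input)^T        (gemm(0,1,...))
 *     state.delta    += col2im( weights^T * delta )           (gemm(1,0,...) + col2im)
 * with m = filters/groups, n = size*size*c/groups and k = out_h*out_w.
 */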
void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate_init, float momentum, float decay)
{
    float learning_rate = learning_rate_init*l.learning_rate_scale;
    //float momentum = a.momentum;
    //float decay = a.decay;
    //int batch = a.batch;

    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.nweights, momentum, l.weight_updates, 1);

    axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    if (l.scales) {
        axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }
}
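
/*
 * SGD with momentum and weight decay, written out:
 *     weight_updates -= decay * batch * weights               (L2 penalty)
 *     weights        += (learning_rate / batch) * weight_updates
 *     weight_updates *= momentum                              (carried to the next step)
 * Biases and batch-norm scales get the same update without the decay term.
 */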
image get_convolutional_weight(convolutional_layer l, int i)
{
    int h = l.size;
    int w = l.size;
    int c = l.c / l.groups;
    return float_to_image(w, h, c, l.weights + i*h*w*c);
}

void rgbgr_weights(convolutional_layer l)
{
    int i;
    for (i = 0; i < l.n; ++i) {
        image im = get_convolutional_weight(l, i);
        if (im.c == 3) {
            rgbgr_image(im);
        }
    }
}

void rescale_weights(convolutional_layer l, float scale, float trans)
{
    int i;
    for (i = 0; i < l.n; ++i) {
        image im = get_convolutional_weight(l, i);
        if (im.c == 3) {
            scale_image(im, scale);
            float sum = sum_array(im.data, im.w*im.h*im.c);
            l.biases[i] += sum*trans;
        }
    }
}

image *get_weights(convolutional_layer l)
{
    image *weights = (image *)xcalloc(l.n, sizeof(image));
    int i;
    for (i = 0; i < l.n; ++i) {
        weights[i] = copy_image(get_convolutional_weight(l, i));
        normalize_image(weights[i]);
        /*
        char buff[256];
        sprintf(buff, "filter%d", i);
        save_image(weights[i], buff);
        */
    }
    //error("hey");
    return weights;
}

image *visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_weights)
{
    image *single_weights = get_weights(l);
    show_images(single_weights, l.n, window);

    image delta = get_convolutional_image(l);
    image dc = collapse_image_layers(delta, 1);
    char buff[256];
    sprintf(buff, "%s: Output", window);
    show_image(dc, buff);
    //save_image(dc, buff);
    free_image(dc);
    return single_weights;
}