- #include "dark_cuda.h"
- #include <stdio.h>
- #include <time.h>
- #include <assert.h>
- #include "network.h"
- #include "image.h"
- #include "data.h"
- #include "utils.h"
- #include "parser.h"
- #include "crop_layer.h"
- #include "connected_layer.h"
- #include "rnn_layer.h"
- #include "gru_layer.h"
- #include "crnn_layer.h"
- #include "detection_layer.h"
- #include "region_layer.h"
- #include "convolutional_layer.h"
- #include "activation_layer.h"
- #include "maxpool_layer.h"
- #include "reorg_layer.h"
- #include "avgpool_layer.h"
- #include "normalization_layer.h"
- #include "batchnorm_layer.h"
- #include "cost_layer.h"
- #include "local_layer.h"
- #include "softmax_layer.h"
- #include "dropout_layer.h"
- #include "route_layer.h"
- #include "shortcut_layer.h"
- #include "blas.h"
- //#ifdef OPENCV
- //#include <opencv2/highgui/highgui_c.h>
- //#endif
- #include "http_stream.h"
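- // GPU (CUDA) training and inference path for darknet networks: forward/backward
- // passes, per-layer weight updates, multi-GPU weight synchronization, and prediction.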
- float * get_network_output_gpu_layer(network net, int i);
- float * get_network_delta_gpu_layer(network net, int i);
- float * get_network_output_gpu(network net);
- typedef struct time_benchmark_layers {
- float time;
- int layer_id, layer_type;
- } time_benchmark_layers;
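- // qsort comparator: returns 1 when a is faster than b, so the array is sorted
- // by time in descending order (slowest layer first).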
- int time_comparator(const void *pa, const void *pb)
- {
- time_benchmark_layers a = *(time_benchmark_layers *)pa;
- time_benchmark_layers b = *(time_benchmark_layers *)pb;
- float diff = a.time - b.time;
- if (diff < 0) return 1;
- else if (diff > 0) return -1;
- return 0;
- }
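- // Forward pass on the GPU: zeroes each layer's delta when training, runs every
- // layer's forward_gpu in order, and, when net.benchmark_layers is set, records
- // a smoothed per-layer timing that is printed sorted by cost at the end.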
- void forward_network_gpu(network net, network_state state)
- {
- static time_benchmark_layers *avg_time_per_layer = NULL;
- static time_benchmark_layers *sorted_avg_time_per_layer = NULL;
- double start_time, end_time;
- if (net.benchmark_layers) {
- if (!avg_time_per_layer) {
- avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
- sorted_avg_time_per_layer = (time_benchmark_layers *)calloc(net.n, sizeof(time_benchmark_layers));
- }
- cudaDeviceSynchronize();
- }
- //printf("\n");
- state.workspace = net.workspace;
- int i;
- for(i = 0; i < net.n; ++i){
- state.index = i;
- layer l = net.layers[i];
- if(l.delta_gpu && state.train){
- fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
- }
- if (net.benchmark_layers) {
- start_time = get_time_point();
- }
- l.forward_gpu(l, state);
- if (net.benchmark_layers) {
- CHECK_CUDA(cudaDeviceSynchronize());
- end_time = get_time_point();
- const double took_time = (end_time - start_time) / 1000;
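- // Exponential moving average of the per-layer time: keep 90% of the history,
- // blend in 10% of the new measurement.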
- const double alpha = 0.9;
- if (avg_time_per_layer[i].time == 0) {
- avg_time_per_layer[i].layer_id = i;
- avg_time_per_layer[i].layer_type = l.type;
- avg_time_per_layer[i].time = took_time;
- }
- else avg_time_per_layer[i].time = avg_time_per_layer[i].time * alpha + took_time * (1 - alpha);
- sorted_avg_time_per_layer[i] = avg_time_per_layer[i];
- printf("\n layer %d - type: %d - %lf ms - avg_time %lf ms \n", i, l.type, took_time, avg_time_per_layer[i].time);
- }
- if(net.wait_stream)
- cudaStreamSynchronize(get_cuda_stream());
- state.input = l.output_gpu;
- //cudaDeviceSynchronize();
- /*
- cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
- if (l.out_w >= 0 && l.out_h >= 1 && l.c >= 3) {
- int j;
- for (j = 0; j < l.out_c; ++j) {
- image img = make_image(l.out_w, l.out_h, 3);
- memcpy(img.data, l.output + l.out_w*l.out_h*j, l.out_w*l.out_h * 1 * sizeof(float));
- memcpy(img.data + l.out_w*l.out_h * 1, l.output + l.out_w*l.out_h*j, l.out_w*l.out_h * 1 * sizeof(float));
- memcpy(img.data + l.out_w*l.out_h * 2, l.output + l.out_w*l.out_h*j, l.out_w*l.out_h * 1 * sizeof(float));
- char buff[256];
- sprintf(buff, "layer-%d slice-%d", i, j);
- show_image(img, buff);
- save_image(img, buff);
- }
- cvWaitKey(0); // wait press-key in console
- cvDestroyAllWindows();
- }
- */
- }
- if (net.benchmark_layers) {
- printf("\n\nSorted by time:\n");
- qsort(sorted_avg_time_per_layer, net.n, sizeof(time_benchmark_layers), time_comparator);
- for (i = 0; i < net.n; ++i) {
- //printf("layer %d - type: %d - avg_time %lf ms \n", avg_time_per_layer[i].layer_id, avg_time_per_layer[i].layer_type, avg_time_per_layer[i].time);
- printf("%d - layer %d - type: %d - avg_time %lf ms \n", i, sorted_avg_time_per_layer[i].layer_id, sorted_avg_time_per_layer[i].layer_type, sorted_avg_time_per_layer[i].time);
- }
- }
- //cudaStreamSynchronize(get_cuda_stream()); // sync CUDA-functions
- //cudaDeviceSynchronize();
- }
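- // Backward pass on the GPU from the last layer to the first. For each layer,
- // state.input/state.delta point at the previous layer's output/delta buffers.
- // l.stopbackward stops propagation entirely; l.onlyforward skips a layer.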
- void backward_network_gpu(network net, network_state state)
- {
- state.workspace = net.workspace;
- int i;
- float * original_input = state.input;
- float * original_delta = state.delta;
- for(i = net.n-1; i >= 0; --i){
- state.index = i;
- layer l = net.layers[i];
- if (l.stopbackward) break;
- if(i == 0){
- state.input = original_input;
- state.delta = original_delta;
- }else{
- layer prev = net.layers[i-1];
- state.input = prev.output_gpu;
- state.delta = prev.delta_gpu;
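- // With optimized memory, layers that do not keep their own delta share the
- // net.state_delta_gpu scratch buffer, which is copied back and zeroed below.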
- if (net.optimized_memory && !prev.keep_delta_gpu) {
- state.delta = net.state_delta_gpu;
- }
- }
- if (l.onlyforward) continue;
- l.backward_gpu(l, state);
- if (i != 0) {
- layer prev = net.layers[i - 1];
- if (net.optimized_memory && state.delta && !prev.keep_delta_gpu) {
- if (prev.delta_gpu != state.delta) simple_copy_ongpu(prev.outputs*prev.batch, state.delta, prev.delta_gpu);
- fill_ongpu(prev.outputs*prev.batch, 0, net.state_delta_gpu, 1);
- }
- }
- /*
- if(i != 0)
- {
- layer l = net.layers[i - 1];
- int state_delta_nan_inf = is_nan_or_inf(state.delta, l.outputs * l.batch);
- int state_input_nan_inf = is_nan_or_inf(state.input, l.outputs * l.batch);
- printf("\n i - %d is_nan_or_inf(s.delta) = %d \n", i, state_delta_nan_inf);
- printf(" i - %d is_nan_or_inf(s.input) = %d \n", i, state_input_nan_inf);
- if (state_delta_nan_inf || state_input_nan_inf) { printf(" found "); getchar(); }
- }
- */
- }
- }
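- // Applies the accumulated gradients: each layer's update_gpu is called with the
- // effective mini-batch size (batch * subdivisions * sequence) and the current
- // learning rate; l.deform is switched off after half of max_batches.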
- void update_network_gpu(network net)
- {
- cuda_set_device(net.gpu_index);
- const int iteration_num = (*net.seen) / (net.batch * net.subdivisions);
- int i;
- int update_batch = net.batch*net.subdivisions * get_sequence_value(net);
- float rate = get_current_rate(net);
- for(i = 0; i < net.n; ++i){
- layer l = net.layers[i];
- l.t = get_current_batch(net);
- if (iteration_num > (net.max_batches * 1 / 2)) l.deform = 0;
- if(l.update_gpu){
- l.update_gpu(l, update_batch, rate, net.momentum, net.decay);
- }
- }
- }
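- // Single forward+backward step for one mini-batch held in host memory: pushes the
- // input and truth to the GPU (allocating the buffers on first use), optionally
- // mirrors FP32 weights into FP16, then runs the forward and backward passes.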
- void forward_backward_network_gpu(network net, float *x, float *y)
- {
- network_state state;
- state.index = 0;
- state.net = net;
- int x_size = get_network_input_size(net)*net.batch;
- int y_size = get_network_output_size(net)*net.batch;
- if(net.layers[net.n-1].truths) y_size = net.layers[net.n-1].truths*net.batch;
- if(!*net.input_gpu){
- *net.input_gpu = cuda_make_array(x, x_size);
- *net.truth_gpu = cuda_make_array(y, y_size);
- }else{
- cuda_push_array(*net.input_gpu, x, x_size);
- cuda_push_array(*net.truth_gpu, y, y_size);
- }
- state.input = *net.input_gpu;
- state.delta = 0;
- state.truth = *net.truth_gpu;
- state.train = 1;
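- // Mixed precision: when CUDNN_HALF is enabled, copy the FP32 weights of
- // convolutional, CRNN and CONV_LSTM layers into their FP16 buffers.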
- #if defined(CUDNN_HALF) && defined(CUDNN)
- int i;
- for (i = 0; i < net.n; ++i) {
- layer l = net.layers[i];
- if (net.cudnn_half){
- if (l.type == CONVOLUTIONAL && l.weights_gpu && l.weights_gpu16) {
- assert((l.nweights) > 0);
- cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16);
- }
- else if (l.type == CRNN && l.input_layer->weights_gpu && l.input_layer->weights_gpu16) {
- assert((l.input_layer->c*l.input_layer->n*l.input_layer->size*l.input_layer->size) > 0);
- cuda_convert_f32_to_f16(l.input_layer->weights_gpu, l.input_layer->nweights, l.input_layer->weights_gpu16);
- cuda_convert_f32_to_f16(l.self_layer->weights_gpu, l.self_layer->nweights, l.self_layer->weights_gpu16);
- cuda_convert_f32_to_f16(l.output_layer->weights_gpu, l.output_layer->nweights, l.output_layer->weights_gpu16);
- }
- else if (l.type == CONV_LSTM && l.wf->weights_gpu && l.wf->weights_gpu16) {
- assert((l.wf->c * l.wf->n * l.wf->size * l.wf->size) > 0);
- if (l.peephole) {
- cuda_convert_f32_to_f16(l.vf->weights_gpu, l.vf->nweights, l.vf->weights_gpu16);
- cuda_convert_f32_to_f16(l.vi->weights_gpu, l.vi->nweights, l.vi->weights_gpu16);
- cuda_convert_f32_to_f16(l.vo->weights_gpu, l.vo->nweights, l.vo->weights_gpu16);
- }
- cuda_convert_f32_to_f16(l.wf->weights_gpu, l.wf->nweights, l.wf->weights_gpu16);
- cuda_convert_f32_to_f16(l.wi->weights_gpu, l.wi->nweights, l.wi->weights_gpu16);
- cuda_convert_f32_to_f16(l.wg->weights_gpu, l.wg->nweights, l.wg->weights_gpu16);
- cuda_convert_f32_to_f16(l.wo->weights_gpu, l.wo->nweights, l.wo->weights_gpu16);
- cuda_convert_f32_to_f16(l.uf->weights_gpu, l.uf->nweights, l.uf->weights_gpu16);
- cuda_convert_f32_to_f16(l.ui->weights_gpu, l.ui->nweights, l.ui->weights_gpu16);
- cuda_convert_f32_to_f16(l.ug->weights_gpu, l.ug->nweights, l.ug->weights_gpu16);
- cuda_convert_f32_to_f16(l.uo->weights_gpu, l.uo->nweights, l.uo->weights_gpu16);
- }
- }
- }
- #endif
- forward_network_gpu(net, state);
- //cudaStreamSynchronize(get_cuda_stream());
- backward_network_gpu(net, state);
- }
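- // Trains on one batch: advances *net.seen, runs forward/backward, and applies a
- // weight update once every (subdivisions * sequence) mini-batches.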
- float train_network_datum_gpu(network net, float *x, float *y)
- {
- *net.seen += net.batch;
- forward_backward_network_gpu(net, x, y);
- float error = get_network_cost(net);
- //if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net);
- const int sequence = get_sequence_value(net);
- if (((*net.seen) / net.batch) % (net.subdivisions*sequence) == 0) update_network_gpu(net);
- return error;
- }
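- // Arguments handed to a per-GPU training thread.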
- typedef struct {
- network net;
- data d;
- float *err;
- } train_args;
- void *train_thread(void *ptr)
- {
- train_args args = *(train_args*)ptr;
- free(ptr);
- cuda_set_device(args.net.gpu_index);
- *args.err = train_network(args.net, args.d);
- return 0;
- }
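- // Launches train_network() for one network and data shard in a new pthread;
- // the caller joins the thread and reads the loss from *err.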
- pthread_t train_network_in_thread(network net, data d, float *err)
- {
- pthread_t thread;
- train_args *ptr = (train_args *)calloc(1, sizeof(train_args));
- ptr->net = net;
- ptr->d = d;
- ptr->err = err;
- if(pthread_create(&thread, 0, train_thread, ptr)) error("Thread creation failed");
- return thread;
- }
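- // The helpers below move weights and accumulated updates between GPU and host for
- // CONVOLUTIONAL and CONNECTED layers (pull_*/push_*), sum and rescale them on the
- // host (merge_*, scale_weights), and push merged values back to a GPU (distribute_*).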
- void pull_updates(layer l)
- {
- if(l.type == CONVOLUTIONAL){
- cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
- cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
- if(l.scale_updates) cuda_pull_array(l.scale_updates_gpu, l.scale_updates, l.n);
- } else if(l.type == CONNECTED){
- cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
- cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
- }
- }
- void push_updates(layer l)
- {
- if(l.type == CONVOLUTIONAL){
- cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
- cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
- if(l.scale_updates) cuda_push_array(l.scale_updates_gpu, l.scale_updates, l.n);
- } else if(l.type == CONNECTED){
- cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
- cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
- }
- }
- void update_layer(layer l, network net)
- {
- int update_batch = net.batch*net.subdivisions;
- float rate = get_current_rate(net);
- l.t = get_current_batch(net);
- if(l.update_gpu){
- l.update_gpu(l, update_batch, rate, net.momentum, net.decay);
- }
- }
- void merge_weights(layer l, layer base)
- {
- if (l.type == CONVOLUTIONAL) {
- axpy_cpu(l.n, 1, l.biases, 1, base.biases, 1);
- axpy_cpu(l.nweights, 1, l.weights, 1, base.weights, 1);
- if (l.scales) {
- axpy_cpu(l.n, 1, l.scales, 1, base.scales, 1);
- }
- } else if(l.type == CONNECTED) {
- axpy_cpu(l.outputs, 1, l.biases, 1, base.biases, 1);
- axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, base.weights, 1);
- }
- }
- void scale_weights(layer l, float s)
- {
- if (l.type == CONVOLUTIONAL) {
- scal_cpu(l.n, s, l.biases, 1);
- scal_cpu(l.nweights, s, l.weights, 1);
- if (l.scales) {
- scal_cpu(l.n, s, l.scales, 1);
- }
- } else if(l.type == CONNECTED) {
- scal_cpu(l.outputs, s, l.biases, 1);
- scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
- }
- }
- void pull_weights(layer l)
- {
- if(l.type == CONVOLUTIONAL){
- cuda_pull_array(l.biases_gpu, l.biases, l.n);
- cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
- if(l.scales) cuda_pull_array(l.scales_gpu, l.scales, l.n);
- } else if(l.type == CONNECTED){
- cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
- cuda_pull_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
- }
- }
- void push_weights(layer l)
- {
- if(l.type == CONVOLUTIONAL){
- cuda_push_array(l.biases_gpu, l.biases, l.n);
- cuda_push_array(l.weights_gpu, l.weights, l.nweights);
- if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n);
- } else if(l.type == CONNECTED){
- cuda_push_array(l.biases_gpu, l.biases, l.outputs);
- cuda_push_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
- }
- }
- void distribute_weights(layer l, layer base)
- {
- if(l.type == CONVOLUTIONAL){
- cuda_push_array(l.biases_gpu, base.biases, l.n);
- cuda_push_array(l.weights_gpu, base.weights, l.nweights);
- if(base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n);
- } else if(l.type == CONNECTED){
- cuda_push_array(l.biases_gpu, base.biases, l.outputs);
- cuda_push_array(l.weights_gpu, base.weights, l.outputs*l.inputs);
- }
- }
- void merge_updates(layer l, layer base)
- {
- if (l.type == CONVOLUTIONAL) {
- axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1);
- axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weight_updates, 1);
- if (l.scale_updates) {
- axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1);
- }
- } else if(l.type == CONNECTED) {
- axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.bias_updates, 1);
- axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weight_updates, 1);
- }
- }
- void distribute_updates(layer l, layer base)
- {
- if(l.type == CONVOLUTIONAL){
- cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.n);
- cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.nweights);
- if(base.scale_updates) cuda_push_array(l.scale_updates_gpu, base.scale_updates, l.n);
- } else if(l.type == CONNECTED){
- cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.outputs);
- cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.outputs*l.inputs);
- }
- }
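- // Averages layer j's weights across all GPUs: pull weights from every network,
- // sum them into nets[0]'s copy, scale by 1/n, then push the average back to each GPU.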
- void sync_layer(network *nets, int n, int j)
- {
- //printf("Syncing layer %d\n", j);
- int i;
- network net = nets[0];
- layer base = net.layers[j];
- cuda_set_device(net.gpu_index);
- pull_weights(base);
- for (i = 1; i < n; ++i) {
- cuda_set_device(nets[i].gpu_index);
- layer l = nets[i].layers[j];
- pull_weights(l);
- merge_weights(l, base);
- }
- scale_weights(base, 1./n);
- for (i = 0; i < n; ++i) {
- cuda_set_device(nets[i].gpu_index);
- layer l = nets[i].layers[j];
- distribute_weights(l, base);
- }
- //printf("Done syncing layer %d\n", j);
- }
- typedef struct{
- network *nets;
- int n;
- int j;
- } sync_args;
- void *sync_layer_thread(void *ptr)
- {
- sync_args args = *(sync_args*)ptr;
- sync_layer(args.nets, args.n, args.j);
- free(ptr);
- return 0;
- }
- pthread_t sync_layer_in_thread(network *nets, int n, int j)
- {
- pthread_t thread;
- sync_args *ptr = (sync_args *)calloc(1, sizeof(sync_args));
- ptr->nets = nets;
- ptr->n = n;
- ptr->j = j;
- if(pthread_create(&thread, 0, sync_layer_thread, ptr)) error("Thread creation failed");
- return thread;
- }
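- // Synchronizes every layer across all networks (one pthread per layer) and
- // equalizes the *seen counters first.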
- void sync_nets(network *nets, int n, int interval)
- {
- int j;
- int layers = nets[0].n;
- pthread_t *threads = (pthread_t *) calloc(layers, sizeof(pthread_t));
- *nets[0].seen += interval * (n-1) * nets[0].batch * nets[0].subdivisions;
- for (j = 0; j < n; ++j){
- *nets[j].seen = *nets[0].seen;
- }
- for (j = 0; j < layers; ++j) {
- threads[j] = sync_layer_in_thread(nets, n, j);
- }
- for (j = 0; j < layers; ++j) {
- pthread_join(threads[j], 0);
- }
- free(threads);
- }
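- // Multi-GPU training driver: splits the batch across n networks, trains each part
- // in its own thread, averages the per-thread losses, and calls sync_nets() to
- // average the weights once every interval batches.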
- float train_networks(network *nets, int n, data d, int interval)
- {
- int i;
- #ifdef _DEBUG
- int batch = nets[0].batch;
- int subdivisions = nets[0].subdivisions;
- assert(batch * subdivisions * n == d.X.rows);
- #endif
- pthread_t *threads = (pthread_t *) calloc(n, sizeof(pthread_t));
- float *errors = (float *) calloc(n, sizeof(float));
- float sum = 0;
- for(i = 0; i < n; ++i){
- data p = get_data_part(d, i, n);
- threads[i] = train_network_in_thread(nets[i], p, errors + i);
- }
- for(i = 0; i < n; ++i){
- pthread_join(threads[i], 0);
- //printf("%f\n", errors[i]);
- sum += errors[i];
- }
- //cudaDeviceSynchronize();
- if (get_current_batch(nets[0]) % interval == 0) {
- printf("Syncing... ");
- fflush(stdout);
- sync_nets(nets, n, interval);
- printf("Done!\n");
- }
- //cudaDeviceSynchronize();
- free(threads);
- free(errors);
- return sum / n;
- }
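- // Copies layer i's output from GPU to host and returns the host pointer
- // (skipped for REGION layers, whose output is expected to already be on the host).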
- float *get_network_output_layer_gpu(network net, int i)
- {
- layer l = net.layers[i];
- if(l.type != REGION) cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
- return l.output;
- }
- float *get_network_output_gpu(network net)
- {
- int i;
- for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break;
- return get_network_output_layer_gpu(net, i);
- }
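- // Inference entry point: copies the input into pinned host memory, pushes it to the
- // preallocated GPU input buffer, runs the forward pass with train = 0, and returns
- // the host pointer to the output of the last non-COST layer.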
- float *network_predict_gpu(network net, float *input)
- {
- if (net.gpu_index != cuda_get_device())
- cuda_set_device(net.gpu_index);
- int size = get_network_input_size(net) * net.batch;
- network_state state;
- state.index = 0;
- state.net = net;
- //state.input = cuda_make_array(input, size); // memory will be allocated in the parse_network_cfg_custom()
- state.input = net.input_state_gpu;
- memcpy(net.input_pinned_cpu, input, size * sizeof(float));
- cuda_push_array(state.input, net.input_pinned_cpu, size);
- state.truth = 0;
- state.train = 0;
- state.delta = 0;
- forward_network_gpu(net, state);
- float *out = get_network_output_gpu(net);
- //cuda_free(state.input); // will be freed in the free_network()
- return out;
- }
|