123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392 |
- //usr/bin/cc -Ofast -lm "${0}" -o "${0%.c}" && ./"${0%.c}" "$@"; s=$?; rm ./"${0%.c}"; exit $s
- #include <math.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <time.h>
- typedef struct matrix{
- int rows, cols;
- double **vals;
- } matrix;
- matrix csv_to_matrix(char *filename, int header);
- matrix make_matrix(int rows, int cols);
- void zero_matrix(matrix m);
- void copy(double *x, double *y, int n);
- double dist(double *x, double *y, int n);
- int *sample(int n);
- int find_int_arg(int argc, char **argv, char *arg, int def);
- int find_arg(int argc, char* argv[], char *arg);
- int closest_center(double *datum, matrix centers)
- {
- int j;
- int best = 0;
- double best_dist = dist(datum, centers.vals[best], centers.cols);
- for(j = 0; j < centers.rows; ++j){
- double new_dist = dist(datum, centers.vals[j], centers.cols);
- if(new_dist < best_dist){
- best_dist = new_dist;
- best = j;
- }
- }
- return best;
- }
- double dist_to_closest_center(double *datum, matrix centers)
- {
- int ci = closest_center(datum, centers);
- return dist(datum, centers.vals[ci], centers.cols);
- }
- int kmeans_expectation(matrix data, int *assignments, matrix centers)
- {
- int i;
- int converged = 1;
- for(i = 0; i < data.rows; ++i){
- int closest = closest_center(data.vals[i], centers);
- if(closest != assignments[i]) converged = 0;
- assignments[i] = closest;
- }
- return converged;
- }
- void kmeans_maximization(matrix data, int *assignments, matrix centers)
- {
- int i,j;
- int *counts = calloc(centers.rows, sizeof(int));
- zero_matrix(centers);
- for(i = 0; i < data.rows; ++i){
- ++counts[assignments[i]];
- for(j = 0; j < data.cols; ++j){
- centers.vals[assignments[i]][j] += data.vals[i][j];
- }
- }
- for(i = 0; i < centers.rows; ++i){
- if(counts[i]){
- for(j = 0; j < centers.cols; ++j){
- centers.vals[i][j] /= counts[i];
- }
- }
- }
- }
- double WCSS(matrix data, int *assignments, matrix centers)
- {
- int i, j;
- double sum = 0;
-
- for(i = 0; i < data.rows; ++i){
- int ci = assignments[i];
- sum += (1 - dist(data.vals[i], centers.vals[ci], data.cols));
- }
- return sum / data.rows;
- }
- typedef struct{
- int *assignments;
- matrix centers;
- } model;
- void smart_centers(matrix data, matrix centers) {
- int i,j;
- copy(data.vals[rand()%data.rows], centers.vals[0], data.cols);
- double *weights = calloc(data.rows, sizeof(double));
- int clusters = centers.rows;
- for (i = 1; i < clusters; ++i) {
- double sum = 0;
- centers.rows = i;
- for (j = 0; j < data.rows; ++j) {
- weights[j] = dist_to_closest_center(data.vals[j], centers);
- sum += weights[j];
- }
- double r = sum*((double)rand()/RAND_MAX);
- for (j = 0; j < data.rows; ++j) {
- r -= weights[j];
- if(r <= 0){
- copy(data.vals[j], centers.vals[i], data.cols);
- break;
- }
- }
- }
- free(weights);
- }
- void random_centers(matrix data, matrix centers){
- int i;
- int *s = sample(data.rows);
- for(i = 0; i < centers.rows; ++i){
- copy(data.vals[s[i]], centers.vals[i], data.cols);
- }
- free(s);
- }
- model do_kmeans(matrix data, int k)
- {
- matrix centers = make_matrix(k, data.cols);
- int *assignments = calloc(data.rows, sizeof(int));
- smart_centers(data, centers);
- //random_centers(data, centers);
- if(k == 1) kmeans_maximization(data, assignments, centers);
- while(!kmeans_expectation(data, assignments, centers)){
- kmeans_maximization(data, assignments, centers);
- }
- model m;
- m.assignments = assignments;
- m.centers = centers;
- return m;
- }
- int main(int argc, char *argv[])
- {
- if(argc < 3){
- fprintf(stderr, "usage: %s <csv-file> [points/centers/stats]\n", argv[0]);
- return 0;
- }
- int i,j;
- srand(time(0));
- matrix data = csv_to_matrix(argv[1], 0);
- int k = find_int_arg(argc, argv, "-k", 2);
- int header = find_arg(argc, argv, "-h");
- int count = find_arg(argc, argv, "-c");
- if(strcmp(argv[2], "assignments")==0){
- model m = do_kmeans(data, k);
- int *assignments = m.assignments;
- for(i = 0; i < k; ++i){
- if(i != 0) printf("-\n");
- for(j = 0; j < data.rows; ++j){
- if(!(assignments[j] == i)) continue;
- printf("%f, %f\n", data.vals[j][0], data.vals[j][1]);
- }
- }
- }else if(strcmp(argv[2], "centers")==0){
- model m = do_kmeans(data, k);
- printf("WCSS: %f\n", WCSS(data, m.assignments, m.centers));
- int *counts = 0;
- if(count){
- counts = calloc(k, sizeof(int));
- for(j = 0; j < data.rows; ++j){
- ++counts[m.assignments[j]];
- }
- }
- for(j = 0; j < m.centers.rows; ++j){
- if(count) printf("%d, ", counts[j]);
- printf("%f, %f\n", m.centers.vals[j][0], m.centers.vals[j][1]);
- }
- }else if(strcmp(argv[2], "scan")==0){
- for(i = 1; i <= k; ++i){
- model m = do_kmeans(data, i);
- printf("%f\n", WCSS(data, m.assignments, m.centers));
- }
- }
- return 0;
- }
- // Utility functions
- int *sample(int n)
- {
- int i;
- int *s = calloc(n, sizeof(int));
- for(i = 0; i < n; ++i) s[i] = i;
- for(i = n-1; i >= 0; --i){
- int swap = s[i];
- int index = rand()%(i+1);
- s[i] = s[index];
- s[index] = swap;
- }
- return s;
- }
- double dist(double *x, double *y, int n)
- {
- int i;
- double mw = (x[0] < y[0]) ? x[0] : y[0];
- double mh = (x[1] < y[1]) ? x[1] : y[1];
- double inter = mw*mh;
- double sum = x[0]*x[1] + y[0]*y[1];
- double un = sum - inter;
- double iou = inter/un;
- return 1-iou;
- }
- void copy(double *x, double *y, int n)
- {
- int i;
- for(i = 0; i < n; ++i) y[i] = x[i];
- }
- void error(char *s){
- fprintf(stderr, "Error: %s\n", s);
- exit(-1);
- }
- char *fgetl(FILE *fp)
- {
- if(feof(fp)) return 0;
- int size = 512;
- char *line = malloc(size*sizeof(char));
- if(!fgets(line, size, fp)){
- free(line);
- return 0;
- }
- int curr = strlen(line);
- while(line[curr-1]!='\n'){
- size *= 2;
- line = realloc(line, size*sizeof(char));
- if(!line) error("Malloc");
- fgets(&line[curr], size-curr, fp);
- curr = strlen(line);
- }
- line[curr-1] = '\0';
- return line;
- }
- // Matrix stuff
- int count_fields(char *line)
- {
- int count = 0;
- int done = 0;
- char *c;
- for(c = line; !done; ++c){
- done = (*c == '\0');
- if(*c == ',' || done) ++count;
- }
- return count;
- }
- double *parse_fields(char *l, int n)
- {
- int i;
- double *field = calloc(n, sizeof(double));
- for(i = 0; i < n; ++i){
- field[i] = atof(l);
- l = strchr(l, ',')+1;
- }
- return field;
- }
- matrix make_matrix(int rows, int cols)
- {
- matrix m;
- m.rows = rows;
- m.cols = cols;
- m.vals = calloc(m.rows, sizeof(double *));
- int i;
- for(i = 0; i < m.rows; ++i) m.vals[i] = calloc(m.cols, sizeof(double));
- return m;
- }
- void zero_matrix(matrix m)
- {
- int i, j;
- for(i = 0; i < m.rows; ++i){
- for(j = 0; j < m.cols; ++j) m.vals[i][j] = 0;
- }
- }
- matrix csv_to_matrix(char *filename, int header)
- {
- FILE *fp = fopen(filename, "r");
- if(!fp) error(filename);
- matrix m;
- m.cols = -1;
- char *line;
- int n = 0;
- int size = 1024;
- m.vals = calloc(size, sizeof(double*));
- if(header) fgetl(fp);
- while((line = fgetl(fp))){
- if(m.cols == -1) m.cols = count_fields(line);
- if(n == size){
- size *= 2;
- m.vals = realloc(m.vals, size*sizeof(double*));
- }
- m.vals[n] = parse_fields(line, m.cols);
- free(line);
- ++n;
- }
- m.vals = realloc(m.vals, n*sizeof(double*));
- m.rows = n;
- return m;
- }
- // Arguement parsing
- void del_arg(int argc, char **argv, int index)
- {
- int i;
- for(i = index; i < argc-1; ++i) argv[i] = argv[i+1];
- argv[i] = 0;
- }
- int find_arg(int argc, char* argv[], char *arg)
- {
- int i;
- for(i = 0; i < argc; ++i) {
- if(!argv[i]) continue;
- if(0==strcmp(argv[i], arg)) {
- del_arg(argc, argv, i);
- return 1;
- }
- }
- return 0;
- }
- int find_int_arg(int argc, char **argv, char *arg, int def)
- {
- int i;
- for(i = 0; i < argc-1; ++i){
- if(!argv[i]) continue;
- if(0==strcmp(argv[i], arg)){
- def = atoi(argv[i+1]);
- del_arg(argc, argv, i);
- del_arg(argc, argv, i);
- break;
- }
- }
- return def;
- }
- float find_float_arg(int argc, char **argv, char *arg, float def)
- {
- int i;
- for(i = 0; i < argc-1; ++i){
- if(!argv[i]) continue;
- if(0==strcmp(argv[i], arg)){
- def = atof(argv[i+1]);
- del_arg(argc, argv, i);
- del_arg(argc, argv, i);
- break;
- }
- }
- return def;
- }
- char *find_char_arg(int argc, char **argv, char *arg, char *def)
- {
- int i;
- for(i = 0; i < argc-1; ++i){
- if(!argv[i]) continue;
- if(0==strcmp(argv[i], arg)){
- def = argv[i+1];
- del_arg(argc, argv, i);
- del_arg(argc, argv, i);
- break;
- }
- }
- return def;
- }
|