#!/usr/perl/perl580/bin/perl
use strict;
use warnings;

use Algorithm::Cluster;
## This library is located at
## /aregano/datasets/work/lobo/Algorithm-Cluster-1.24/lib/site_perl/5.6.1/i686-linux
## On the perl command line you must always give the path to the library in
## the form  perl -I[path name]  or it will not work.

## Purpose: run k-means clustering (Algorithm::Cluster::kcluster) on a
## tab-separated table and report, for every cluster, its homogeneity, its
## separation and the product of both.
##
## Input file layout: one title line (skipped), then one line per patient:
##     <patient id> TAB <patient class> TAB <feature 1> TAB <feature 2> ...
##
## Usage: perl -I<lib path> <script> <data file> <number of runs> <number of clusters>
##
## The report is written to "<data file>.kmeans.<runs>.<clusters>" and the
## main figures are echoed on STDOUT.

@ARGV >= 3
    or die "Usage: $0 <data file> <number of runs> <number of clusters>\n";

my ($data_file, $run_number, $cluster_number) = @ARGV;

$run_number =~ /\A\d+\z/
    or die "Number of runs must be a non-negative integer, got '$run_number'\n";
$cluster_number =~ /\A\d+\z/ && $cluster_number > 0
    or die "Number of clusters must be a positive integer, got '$cluster_number'\n";

my $out_file = "$data_file.kmeans.$run_number.$cluster_number";

# Three-argument open with lexical handles: the file names come straight
# from the command line and must never be interpreted as an open mode.
open my $in, '<', $data_file
    or die "Cannot open '$data_file' for reading: $!\n";
open my $out, '>', $out_file
    or die "Cannot open '$out_file' for writing: $!\n";

my $firstline = <$in>;    # Skip the title line
defined $firstline
    or die "'$data_file' is empty\n";

my @patient_id;
my @patient_class;
my @patient_data;         # matrix: one array reference (feature row) per patient

while (my $line = <$in>) {
    $line =~ s/[\r\n]+\z//;        # accept both Unix and DOS line endings
    next if $line !~ /\S/;         # a blank line is not a patient

    my @field = split /\t/, $line;
    my $id    = shift @field;
    my $class = shift @field;

    push @patient_id,    $id;
    push @patient_class, defined $class ? $class : '';
    push @patient_data,  [@field]; # what is left of the line are the features

    print "$id\n";
}
close $in;

@patient_data
    or die "'$data_file' contains no data lines\n";

# Index of the last feature column, taken from the data itself.
my $last_feature = $#{ $patient_data[0] };

## Uncomment the loop below to dump the parsed matrix; it is only needed to
## check that a new dataset was read correctly.
#for my $row (0 .. $#patient_data) {
#    for my $col (0 .. $#{ $patient_data[$row] }) {
#        print "\t Element $row $col is $patient_data[$row][$col]\n";
#    }
#}

## The hash generated below is the input to the subroutine kcluster; for more
## information about the meaning of the parameters please read
## http://bonsai.ims.u-tokyo.ac.jp/~mdehoon/software/cluster/cluster.pdf
my %params = (
    nclusters => $cluster_number,
    data      => \@patient_data,
    mask      => '',
    weight    => '',
    transpose => 0,
    npass     => $run_number,      # number of iterations
    method    => 'a',
    dist      => 'e',
);

# Only the cluster assignment (one cluster number per patient) and the count
# of runs that reached the best solution are used; the second return value
# is ignored by this script.
my ($clusterid, undef, $found) = Algorithm::Cluster::kcluster(%params);
defined $clusterid
    or die "kcluster returned no cluster assignment\n";

print "\n";
print "Clustering data set:\n\n";

print {$out} "entity\tsample_ID\tsample_class\tcluster\n";
for my $entity (0 .. $#{$clusterid}) {
    my $row = sprintf "%2d\t%2s\t%2s\t%2d\n",
        $entity, $patient_id[$entity], $patient_class[$entity],
        $clusterid->[$entity];
    print {$out} $row;
    print $row;
}
print {$out} "\n";
print {$out} "\n";

for my $cluster (0 .. $cluster_number - 1) {
    my @members  = making_clusters($clusterid, $cluster);
    my @centroid = generating_centroids(\@patient_data, \@members, $last_feature);

    my $homogeneity_value
        = calculating_homogeneity(\@patient_data, \@members, \@centroid, $last_feature);
    my $separation_final_value
        = calculating_separation(\@patient_data, \@members, $last_feature);
    my $product = $homogeneity_value * $separation_final_value;

    # Every figure goes both to the report file and to STDOUT.
    for my $report_line (
        "This is the homogeneity of cluster $cluster: $homogeneity_value\n",
        "This is the separation of cluster $cluster: $separation_final_value\n",
        "This is h X s of cluster $cluster: $product\n",
    ) {
        print {$out} $report_line;
        print $report_line;
    }
    print "\n";
}

print {$out} "\n";
print {$out} "Number of times this program run = $run_number\nNumber of times this program found the optimal solution: $found\n";

# Buffered write errors only surface when the handle is closed.
close $out
    or die "Error while writing '$out_file': $!\n";

####################################SUBROUTINES######################################

# making_clusters(\@clusterid, $cluster)
# Returns the list of patient indices that were assigned to cluster $cluster,
# in ascending order. The list is empty when no patient is in that cluster.
sub making_clusters {
    my ($assignment, $cluster) = @_;
    return grep { $assignment->[$_] == $cluster } 0 .. $#{$assignment};
}

######################################################################################

# generating_centroids(\@data, \@members, $last_feature)
# Returns the centroid of the given patients: for every feature column
# 0 .. $last_feature, the arithmetic mean over the member rows.
# Returns an empty list for an empty cluster.
sub generating_centroids {
    my ($data, $members, $last_feature) = @_;
    return () if !@{$members};

    my @centroid;
    for my $feature (0 .. $last_feature) {
        my $soma = 0;
        $soma += $data->[$_][$feature] for @{$members};
        $centroid[$feature] = $soma / @{$members};
    }
    return @centroid;
}

######################################################################################

# calculating_homogeneity(\@data, \@members, \@centroid, $last_feature)
# Returns 1 / (1 + d), where d is the mean Euclidean distance between each
# member of the cluster and the cluster centroid. An empty cluster has d = 0
# and therefore a homogeneity of 1.
sub calculating_homogeneity {
    my ($data, $members, $centroid, $last_feature) = @_;

    my @distance_to_centroid
        = map { euclidean_distance($data->[$_], $centroid, $last_feature) }
          @{$members};

    return 1 / (1 + mean(@distance_to_centroid));
}

######################################################################################

# calculating_separation(\@data, \@members, $last_feature)
# Returns the mean Euclidean distance over all (member, non-member) pairs of
# patients. When there is no such pair (the cluster is empty, or it holds
# every patient) the result is 0 instead of a division by zero.
sub calculating_separation {
    my ($data, $members, $last_feature) = @_;

    my %is_member = map { $_ => 1 } @{$members};
    my @outsiders = grep { !$is_member{$_} } 0 .. $#{$data};

    my @separation;
    for my $member (@{$members}) {
        push @separation,
            map { euclidean_distance($data->[$member], $data->[$_], $last_feature) }
            @outsiders;
    }
    return mean(@separation);
}

######################################################################################

# euclidean_distance(\@point_a, \@point_b, $last_feature)
# Euclidean distance between two rows over feature columns 0 .. $last_feature.
sub euclidean_distance {
    my ($point_a, $point_b, $last_feature) = @_;

    my $soma = 0;
    for my $feature (0 .. $last_feature) {
        my $difference = $point_a->[$feature] - $point_b->[$feature];
        $soma += $difference**2;
    }
    return $soma**(1/2);
}

######################################################################################

# mean(@values)
# Arithmetic mean of the arguments; 0 for an empty list.
sub mean {
    my @values = @_;
    return 0 if !@values;

    my $soma = 0;
    $soma += $_ for @values;
    return $soma / @values;
}

####Trash (may be util sometime)####
## Everything below is commented-out legacy code kept for reference only; it
## refers to variables (@matrix, ...) that no longer exist in this script.
#for $j (0..$#matrix) {
#    for $k (0..$#{$matrix[$matrix]}) {
#        print ("\t Element is $j $k is $matrix[$j][$k]\n");
#    }
#}
#$centroid_id = 0;
#$external_loop_control = $#matrix;
#@separation = '';
#while ($centroid_id <= $external_loop_control) { ##this loop will iterate as much times as the number of clusters;
#    $soma = 0;
#    $floating_position_of_x = 0;
#    $separation_x = 0;
##    print ("Now I'm in loop $centroid_id\n");
##    print ("$#matrix\n");
##    print ("$#{$matrix[$matrix]}\n");
##    print ("$external_loop_control\n");
#    while ($floating_position_of_x <= $external_loop_control) {
#        $feature_id = 0;
#        while ($feature_id <= $#{$matrix[$matrix]}) {
#            if ($centroid_id == $floating_position_of_x) {
#                if ($centroid_id == $external_loop_control) {
#                    last;
#                }
#                $floating_position_of_x++;
#            }
#            for $la (0..$#matrix) {
#                for $lala (0..$#{$matrix[$matrix]}) {
#                    print ("$matrix[$la][$lala]\t");
#                }
#                print ("\n");
#            }
#            $separation_difference = ($matrix[$centroid_id][$feature_id] - $matrix[$floating_position_of_x][$feature_id]);
#            $separation_square = $separation_difference**2;
#            print ("Now I'm looking centroid $centroid_id; line $floating_position_of_x.\n Feature $feature_id has a value of $matrix[$centroid_id][$feature_id], and it's differenece of the feature centroid $floating_position_of_x ($matrix[$floating_position_of_x][$feature_id]) is $separation_difference, it's square is $separation_square\n");
#            $separation[$separation_x][$feature_id] = $separation_square;
#            print ("element $separation_x $feature_id is $separation[$separation_x][$feature_id]\n");
#            $a = ;
#            $feature_id++;
#        }
#        $separation_x++;
#        $floating_position_of_x++;
#    }
#    $separation_line = 0;
#    @almost_done = '';
#    while ($separation_line <= $#separation) {
#        $separation_col = 0;
#        $soma = 0;
#        while ($separation_col <= $#{$separation[$separation]}) {
##            for $la (0..$#separation) {
##                for $lala (0..$#{$separation[$separation]}) {
##                    print ("$separation[$la][$lala]\t");
##                }
##                print ("\n");
##            }
##            print ("$separation[$separation_line][$separation_col]\n");
##            $a = ;
#            $soma += $separation[$separation_line][$separation_col];
##            print ("This is the sum: $soma\n");
#            $separation_col++;
#        }
#        $soma_root = $soma**(1/2);
##        print ("This is the square root of $soma: $soma_root\n");
#        $almost_done[$separation_line] = $soma_root;
#        $separation_line++;
#    }
#    $soma_final = 0;
##    print ("this is the array that is the root of the sum of the square of the difference of the centroids: @almost_done\n");
##    $a = ;
#    foreach $element (@almost_done) {
#        $soma_final += $element;
#    }
##    print ("$soma_final\n");
#    $soma_final = ($soma_final/($#almost_done+1));
#    $separation[$centroid_id] = $soma_final;
##    print ("\@separation index $centroid_id is $separation[$centroid_id] ($soma_final)\n");
##    $a = ;
##    $centroid_id++;
#}
##print ("@separation is the problem...\n");
#$i = 0;
#foreach (@separation) {
#    print ("This is the separation of cluster $i: $_\n");
#    print ("\n");
#    $i++;
#}