#!/usr/perl/perl580/bin/perl
##
## K-means clustering of a tab-separated patient table, followed by two
## quality figures for every cluster:
##
##   homogeneity = 1 / (1 + mean Euclidean distance of the cluster's
##                          patients to the cluster centroid)
##   separation  = mean Euclidean distance from the cluster centroid to
##                 the centroids of all the other clusters
##
## Usage:
##   perl -I[path name] this_script DATA_FILE NUMBER_OF_RUNS NUMBER_OF_CLUSTERS
##
## DATA_FILE layout: the first line is a title line and is skipped.  Every
## other line is "id <TAB> class <TAB> feature <TAB> feature ...".
##
## Algorithm::Cluster is located at
## /aregano/datasets/work/lobo/Algorithm-Cluster-1.24/lib/site_perl/5.6.1/i686-linux
## On the perl command line you must always give the path to the library in
## the form  perl -I[path name]  or it will not work.

use strict;
use warnings;

use Algorithm::Cluster;

my ($data_file, $run_number, $cluster_number) = @ARGV;

die "Usage: $0 DATA_FILE NUMBER_OF_RUNS NUMBER_OF_CLUSTERS\n"
    unless defined $data_file
        && defined $run_number
        && $run_number =~ /\A\d+\z/
        && defined $cluster_number
        && $cluster_number =~ /\A[1-9]\d*\z/;

open(my $data_fh, '<', $data_file)
    or die "Cannot open '$data_file' for reading: $!\n";

# Earlier versions of this script created (and never wrote to) these two
# files.  They are still created empty so that anything that expects them to
# exist keeps working; failing to create them is not fatal because nothing
# below depends on them.
for my $suffix (qw(out confere)) {
    my $path = "$data_file.$suffix";
    if (open(my $unused_fh, '>', $path)) {
        close($unused_fh);
    }
    else {
        warn "Cannot create '$path': $!\n";
    }
}

# Skip the title line.
my $title_line = <$data_fh>;
die "'$data_file' is empty\n" unless defined $title_line;

my @patient_id;
my @patient_class;
my @patient_data;    # one array reference of feature values per patient

while (my $line = <$data_fh>) {
    $line =~ s/\r?\n\z//;           # tolerate both Unix and DOS line endings
    next unless $line =~ /\S/;      # a blank line is not a patient

    my @field = split /\t/, $line;
    my $id    = shift @field;
    my $class = shift @field;

    push @patient_id,    $id;
    push @patient_class, $class;
    push @patient_data,  [@field];

    print "Patient $id is $class\n";
}
close($data_fh);

die "'$data_file' contains no patient rows\n" unless @patient_data;

# The number of features is taken from the data itself.  Deriving it from the
# title line (as was done before) is off by one whenever the title line does
# not carry exactly one more label than a data row has feature values.
my $feature_count = scalar @{ $patient_data[0] };

## The hash generated below is the input to the subroutine kcluster; for
## more information about the meaning of the parameters please read
## http://bonsai.ims.u-tokyo.ac.jp/~mdehoon/software/cluster/cluster.pdf
my %params = (
    nclusters => $cluster_number,
    data      => \@patient_data,
    mask      => '',
    weight    => '',
    transpose => 0,
    npass     => $run_number,    # number of iterations
    method    => 'a',
    dist      => 'e',
);

# The second return value is not used by this script.
my ($clusterid, $error, $found) = Algorithm::Cluster::kcluster(%params);

die "kcluster did not return a cluster assignment\n"
    unless ref $clusterid eq 'ARRAY' && @{$clusterid};

print "\n";
print "Clustering data set:\n\n";
for my $patient (0 .. $#{$clusterid}) {
    printf "Patient %2d (%2s) belongs to cluster\t%2d\n",
        $patient, $patient_class[$patient], $clusterid->[$patient];
}
print "\n";
print "\n";

# Homogeneity of every cluster; the centroids are kept for the separation.
my @centroids;
for my $cluster (0 .. $cluster_number - 1) {
    my @members = making_clusters($clusterid, $cluster);

    # Without members there is no centroid; carrying on would only produce
    # figures that look valid but are not.
    die "Cluster $cluster has no members, cannot compute its centroid\n"
        unless @members;

    my @centroid = generating_centroids(\@patient_data, \@members, $feature_count);
    my $homogeneity_value
        = calculating_homogeneity(\@patient_data, \@members, \@centroid);

    push @centroids, \@centroid;

    print "This is the homogeneity of cluster $cluster: $homogeneity_value\n";
    print "\n";
}

# Separation of every cluster.
for my $cluster (0 .. $#centroids) {
    my $separation = calculating_separation(\@centroids, $cluster);
    print "This is the separation of cluster $cluster: $separation\n";
    print "\n";
}

print "\n";
print "Number of times this program run = $run_number\nNumber of times this program found the optimal solution: $found\n";

####################################SUBROUTINES######################################

# making_clusters($clusterid, $cluster)
#   $clusterid - array reference: cluster number of every patient, by index
#   $cluster   - the cluster number to collect
# Returns the list of patient indices assigned to $cluster, in ascending
# order (an empty list when the cluster has no members).
sub making_clusters {
    my ($clusterid, $cluster) = @_;

    return grep { $clusterid->[$_] == $cluster } 0 .. $#{$clusterid};
}

######################################################################################

# generating_centroids($data, $members, $feature_count)
#   $data          - array reference of per-patient feature rows
#   $members       - array reference of patient indices (must not be empty)
#   $feature_count - number of feature columns to average
# Returns the centroid as a list: the arithmetic mean of every feature over
# the given patients.
sub generating_centroids {
    my ($data, $members, $feature_count) = @_;

    my $member_count = scalar @{$members};
    my @centroid;
    for my $feature (0 .. $feature_count - 1) {
        my $sum = 0;
        for my $patient (@{$members}) {
            $sum += $data->[$patient][$feature];
        }
        push @centroid, $sum / $member_count;
    }

    return @centroid;
}

######################################################################################

# calculating_homogeneity($data, $members, $centroid)
#   $data     - array reference of per-patient feature rows
#   $members  - array reference of patient indices (must not be empty)
#   $centroid - array reference holding the centroid of those patients
# Returns 1 / (1 + mean Euclidean distance between the patients and the
# centroid); the value is 1 when every patient sits on the centroid and gets
# smaller as the patients spread out.
sub calculating_homogeneity {
    my ($data, $members, $centroid) = @_;

    my $distance_sum = 0;
    for my $patient (@{$members}) {
        $distance_sum += euclidean_distance($data->[$patient], $centroid);
    }

    return 1 / (1 + $distance_sum / scalar @{$members});
}

######################################################################################

# calculating_separation($centroids, $cluster)
#   $centroids - array reference of centroids (each an array reference)
#   $cluster   - index of the centroid to measure
# Returns the mean Euclidean distance from centroid $cluster to every other
# centroid, or 0 when there is no other centroid.
#
# The result is accumulated in plain scalars.  The previous implementation
# stored its scratch rows and its results in the same array, so later
# clusters dereferenced earlier numeric results as symbolic references.
sub calculating_separation {
    my ($centroids, $cluster) = @_;

    my $distance_sum = 0;
    my $other_count  = 0;
    for my $other (0 .. $#{$centroids}) {
        next if $other == $cluster;
        $distance_sum
            += euclidean_distance($centroids->[$cluster], $centroids->[$other]);
        $other_count++;
    }

    return $other_count ? $distance_sum / $other_count : 0;
}

######################################################################################

# euclidean_distance($from, $to)
#   $from, $to - array references of numbers
# Returns the Euclidean distance between the two vectors, measured over the
# indices of $to.
sub euclidean_distance {
    my ($from, $to) = @_;

    my $square_sum = 0;
    for my $feature (0 .. $#{$to}) {
        my $difference = $from->[$feature] - $to->[$feature];
        $square_sum += $difference**2;
    }

    # "** 0.5" rather than sqrt() so the printed figures keep the exact
    # floating-point values the script has always produced.
    return $square_sum**0.5;
}

######################################################################################