#!/usr/perl/perl580/bin/perl
use strict;
use warnings;

use Algorithm::Cluster;

## Algorithm::Cluster is located at
## /aregano/datasets/work/lobo/Algorithm-Cluster-1.24/lib/site_perl/5.6.1/i686-linux
## On the perl command line you must always give the path to the library in
## the form  perl -I[path name]  or the module will not be found.
##
## Usage:
##   perl -I[path name] <this script> data_file method dist [distance_file]
##
##   data_file      tab-separated table. The first line is a title line and is
##                  skipped. Every other line is: sample id, class, then the
##                  feature values.
##   method         linkage code handed to treecluster ('a' and 'c' get a
##                  descriptive header line in the report).
##   dist           distance code handed to treecluster ('e' and 'b' get a
##                  descriptive header line in the report).
##   distance_file  optional; a "<data_file>.dist" file written by a previous
##                  run. When given, the pairwise distances are loaded from it
##                  instead of being recomputed.
##
## Output:
##   <data_file>.out.hierarquico.<method>.<dist>   the report
##   <data_file>.dist                              pairwise distances (only
##                                                 when they are computed here)

die "Usage: $0 data_file method dist [distance_file]\n" if @ARGV < 3;

my ($data_file, $method, $dist, $distance_file) = @ARGV;
my $reuse_distances = defined $distance_file && $distance_file ne '';

# ---------------------------------------------------------------- input ----

my @patient_id;       # sample id of every data row
my @patient_class;    # class label of every data row
my @patient_data;     # one array ref of feature values per data row

open my $data_in, '<', $data_file
    or die "Cannot open data file '$data_file': $!\n";

my $title_line = <$data_in>;    # Skip the title line
die "Data file '$data_file' is empty\n" unless defined $title_line;

while (my $line = <$data_in>) {
    chomp $line;
    next unless $line =~ /\S/;    # a blank line is not a sample

    my @field = split /\t/, $line;
    push @patient_id,    shift @field;
    push @patient_class, shift @field;
    push @patient_data,  [@field];
}
close $data_in or die "Cannot close data file '$data_file': $!\n";

die "Data file '$data_file' contains no samples\n" unless @patient_data;

my $last_patient = $#patient_data;

# The number of features is taken from the data itself. Counting the columns
# of the title line (as was done before) is one too many whenever the title
# line also names the class column.
my $feature_count = scalar @{ $patient_data[0] };

print "$last_patient\n";

my $out_file = "$data_file.out.hierarquico.$method.$dist";
open my $out, '>', $out_file
    or die "Cannot write report '$out_file': $!\n";

# ------------------------------------------------------ distance matrix ----

print "Initiating clustering\n";
print "Generating distance matrix\n";

# Pairwise distances between samples, keyed by "<index_a>\t<index_b>".
# NOTE(review): these are always Euclidean, whatever $dist says, so the
# homogeneity/separation figures are Euclidean even for a city-block tree.
my %distance;

if ($reuse_distances) {
    open my $dist_in, '<', $distance_file
        or die "Cannot open distance file '$distance_file': $!\n";
    while (my $line = <$dist_in>) {
        chomp $line;
        next unless $line =~ /\S/;
        my ($distance_id, $value) = split /;/, $line;
        $distance{$distance_id} = $value;
    }
    close $dist_in or die "Cannot close distance file '$distance_file': $!\n";
}
else {
    my $dist_file = "$data_file.dist";
    open my $dist_out, '>', $dist_file
        or die "Cannot write distance file '$dist_file': $!\n";

    for my $element (0 .. $last_patient) {
        for my $other (0 .. $last_patient) {
            # A sample is never paired with itself. The skip is done here,
            # before anything is stored, so that the last sample no longer
            # produces a bogus entry for a sample index past the end.
            next if $other == $element;

            print "Calculating the distance of $element and $other\n";
            my $distance_id = "$element\t$other";
            $distance{$distance_id}
                = _euclidean_distance($patient_data[$element],
                                      $patient_data[$other]);
            print {$dist_out} "$distance_id;$distance{$distance_id}\n";
        }
    }
    close $dist_out or die "Cannot close distance file '$dist_file': $!\n";
}

print "matrix distance generated\n";

# ----------------------------------------------------------- clustering ----

print {$out} "Cluster method - Pairwise average linkage\t"  if $method eq 'a';
print {$out} "Cluster method - Pairwise centroid linkage\t" if $method eq 'c';
print {$out} "Distance - Euclidean\n"                       if $dist eq 'e';
print {$out} "Distance - City Block\n"                      if $dist eq 'b';

## The hash below is the input to the subroutine treecluster; for more
## information about the meaning of the parameters please read
## http://bonsai.ims.u-tokyo.ac.jp/~mdehoon/software/cluster/cluster.pdf
my %params = (
    applyscale => 0,
    transpose  => 0,
    method     => $method,
    dist       => $dist,
    data       => \@patient_data,
    mask       => '',
    weight     => '',
);

# This script is written against the Algorithm::Cluster 1.24 interface, in
# which the first returned value is a reference to an array of
# [entity_1, entity_2] pairs. The second returned value is not used here.
my ($result) = Algorithm::Cluster::treecluster(%params);
die "treecluster did not return an array of nodes "
    . "(this script expects the Algorithm::Cluster 1.24 interface)\n"
    unless ref $result eq 'ARRAY';

print "Cluster process finished\n";

print {$out} "sample_id\tsample\tclass\n";
for my $sample (0 .. $#patient_id) {
    print {$out} "$sample\t$patient_id[$sample]\t$patient_class[$sample]\n";
}
print {$out} "\n";
print {$out} "----------------";
print {$out} "\n";

# The two members of every cluster, keyed by cluster id. Cluster ids are
# -1, -2, ... in the order treecluster returned the nodes. A member that is
# >= 0 is a sample index; a negative member is the id of another cluster.
my %cluster;

print {$out} "cluster\tentity_1\tentity_2\n";
for my $node_index (0 .. $#{$result}) {
    my $cluster_id = -1 - $node_index;
    my ($element_1, $element_2) = @{ $result->[$node_index] }[0, 1];

    printf {$out} "%3d:\t%3d\t%3d\t\n", $cluster_id, $element_1, $element_2;
    printf        "%3d:\t%3d\t%3d\t\n", $cluster_id, $element_1, $element_2;

    $cluster{$cluster_id} = [$element_1, $element_2];
}

# ------------------------------------------------------ cluster quality ----

## The main idea of this loop is to get all entities that are in a cluster.
## Sometimes one or both members of a cluster are clusters themselves, so
## the sub-clusters are queued and expanded until only entities remain.
print {$out} "----------------\n";
print {$out} "Cluster\tHomogeneity\tSeparation\tH x S\n";

for my $cluster_number (1 .. scalar @{$result}) {
    my $cluster_id = -$cluster_number;

    # First-in, first-out expansion of the cluster into its entities.
    my @elements;
    my @pending = ($cluster_id);
    while (@pending) {
        my $current = shift @pending;
        my ($entities, $subclusters)
            = getting_cluster_elements(@{ $cluster{$current} });
        push @elements, @{$entities};
        push @pending,  @{$subclusters};
    }

    my @centroid = generating_centroids(@elements);

    my $homogeneity_value = calculating_homogeneity(\@elements, \@centroid);
    print {$out} "$cluster_id\t$homogeneity_value\t";
    print "This is the homogeneity of cluster $cluster_id: $homogeneity_value\n";

    my ($separation_sum, $pair_count) = calculating_separation(@elements);

    # Average over the number of (inside, outside) pairs. A cluster holding
    # every sample has no such pair; 1 is used then, which gives 0.
    # (The divisor used to be the pair count plus one for every cluster.)
    my $divide = $pair_count > 0 ? $pair_count : 1;
    print "$divide\n";

    my $separation_final_value = $separation_sum / $divide;
    print {$out} "$separation_final_value\t";
    print "This is the separation of cluster $cluster_id: "
        . "$separation_final_value, divided by $divide\n";

    my $product = $separation_final_value * $homogeneity_value;
    print {$out} "$product\n";
    print "This is the H x S of cluster $cluster_id: $product\n";
    print "\n";
}

close $out or die "Cannot close report '$out_file': $!\n";

########################################SUBROUTINES########################################

# Sorts the two members of a cluster into entities and sub-clusters.
#
#   $element_1, $element_2   the two members of a cluster.
#
# Returns two array references: the members that are entities (>= 0) and the
# members that are clusters (< 0) and still have to be expanded. The order of
# the members is kept inside each list.
sub getting_cluster_elements {
    my ($element_1, $element_2) = @_;

    my (@entities, @subclusters);
    for my $member ($element_1, $element_2) {
        if ($member >= 0) {
            push @entities, $member;
        }
        else {
            push @subclusters, $member;
        }
    }
    return (\@entities, \@subclusters);
}

###########################################################################################

# Computes the centroid of a group of samples.
#
#   @_   indices (into @patient_data) of the samples of the group.
#
# Returns a list with one value per feature: the mean of that feature over
# the given samples.
sub generating_centroids {
    my @members = @_;

    my @centroid;
    for my $feature (0 .. $feature_count - 1) {
        my $sum = 0;
        for my $member (@members) {
            $sum += $patient_data[$member][$feature];
        }
        push @centroid, $sum / scalar @members;
    }
    return @centroid;
}

###########################################################################################

# Computes the homogeneity of a cluster: 1 / (1 + d), where d is the mean
# Euclidean distance of the cluster's samples to the centroid.
#
#   $members    array ref of sample indices belonging to the cluster.
#   $centroid   array ref with the centroid of those samples.
#
# The distances are accumulated in a variable local to the call; the shared
# array used before kept values of earlier, larger clusters.
sub calculating_homogeneity {
    my ($members, $centroid) = @_;

    my $distance_sum = 0;
    for my $patient (@{$members}) {
        my $sum = 0;
        for my $feature (0 .. $#{$centroid}) {
            my $difference
                = $patient_data[$patient][$feature] - $centroid->[$feature];
            $sum += $difference**2;
        }
        $distance_sum += $sum**(1 / 2);
    }
    return 1 / (1 + $distance_sum / scalar @{$members});
}

###########################################################################################

# Adds up the distances between every sample of a cluster and every sample
# outside of it, using the entries of %distance.
#
#   @_   indices of the samples belonging to the cluster.
#
# Returns ($sum, $pairs): the sum of the distances and the number of
# (inside, outside) pairs visited. A pair without an entry in %distance adds
# nothing to the sum but is still counted.
sub calculating_separation {
    my @members = @_;

    my %in_cluster = map { $_ => 1 } @members;

    my $sum   = 0;
    my $pairs = 0;
    for my $member (@members) {
        for my $outsider (0 .. $#patient_data) {
            next if $in_cluster{$outsider};

            my $pair_distance = $distance{"$member\t$outsider"};
            $sum += $pair_distance if defined $pair_distance;
            $pairs++;
        }
    }
    return ($sum, $pairs);
}

###########################################################################################

# Euclidean distance between two samples.
#
#   $row_a, $row_b   array refs of feature values.
sub _euclidean_distance {
    my ($row_a, $row_b) = @_;

    my $sum = 0;
    for my $feature (0 .. $feature_count - 1) {
        my $difference = $row_a->[$feature] - $row_b->[$feature];
        $sum += $difference**2;
    }
    return $sum**(1 / 2);
}