Anyway! The result is here, the kMeans module is part of the script, but you can use it as a general library for kMeans clustering too, if you'd want to. It's not even limited to 2D. :) All you have to do is copy the bottom half of the program into its own pm file.
Enjoy!
#!/usr/bin/perl -w
# k Means demo program
# Adapted from a VB program by Kardi Teknomo
# http://people.revoledu.com/kardi/tutorial/kMean/index.html
# Ported to Perl/Tk by bart @ Perlmonks
use strict;
# ----------------------- Tk interface -------------------------
use Tk;
my $mw = MainWindow->new( -height => 403, -width => 477,
-title => "k Means Clustering, adapted from tutorial by Kardi Teknomo" );
my $button_reset = $mw->Button(-text => "Clear Data", -relief => "raised", -command => \&reset_click);
$button_reset->place( -x => 176, -y => 24, -height => 25, -width => 65);
{
my $label = $mw->Label(
-text => "Click data in the picture box below. The program will automatically cluster the data by color code."
);
$label->place( -x => 0, -y => 0, -height => 16, -width => 473);
$label = $mw->Label(-text => "Number of clusters");
$label->place( -x => 10, -y => 28, -height => 18, -width => 95);
$label = $mw->Label(-text => "(X, Y)", -justify => 'right');
$label->place( -x => 280, -y => 24, -height => 13, -width => 40);
}
my $label_xy = $mw->Label( -text => "X, Y");
$label_xy->place( -x => 330, -y => 24, -height => 13, -width => 50);
my $clusters_entry = $mw->Entry(-relief => "sunken", -disabledforeground => 'darkgray');
$clusters_entry->place( -x => 112, -y => 24, -height => 24, -width => 24);
$clusters_entry->insert('end', '3');
my $canvas = $mw->Scrolled('Canvas', -scrollbars => '', -background => 'white');
$canvas->place( -x => 0, -y => 56, -height => 403-56, -width => 477);
$canvas->CanvasBind( "", [ \&canvas_click, Ev('x'), Ev('y')]);
$canvas->CanvasBind( "", [ \&canvas_mousemove, Ev('x'), Ev('y')]);
$canvas->CanvasBind( "", [ \&canvas_mouseleave ]);
#----------------------- Event Handlers ------------------------
my @color = qw(red yellow green cyan blue purple gray magenta pink
chartreuse coral darkolivegreen);
# If you want to be able to have more clusters, add more colours.
my(%point, @cluster, $dataset);
sub reset_click {
$canvas->delete('all');
(@cluster, %point, $dataset) = ();
$clusters_entry->configure(-state => 'normal');
}
sub canvas_mousemove {
my $canvas = shift;
my($x, $y) = @_;
$label_xy->configure(-text => "($x, $y)");
}
sub canvas_mouseleave {
my $canvas = shift;
$label_xy->configure(-text => "");
}
sub canvas_click {
my $canvas = shift;
my($x, $y) = @_;
if(!$dataset) {
my $clusters = $clusters_entry->get;
if($clusters !~ /^\d+$/ or $clusters == 0 or $clusters > @color) {
warn "Not a valid value for cluster count";
return;
}
$dataset = Data::Cluster::kMean->new(0+$clusters) or die "Failed to make object";
$clusters_entry->configure(-state => 'disabled');
}
$dataset->add(my $point = [ $x, $y ]);
my %record = ( data => $point, cluster => -1 );
$record{id} = $canvas->createLine($x, $y, $x, $y,
-fill => 'red', -width => 8,
-capstyle => 'round', -tags => ['dot'],
);
$point{$point} = \%record;
foreach my $cluster ($dataset->clusters) {
my $i = $cluster->index;
my $r = $cluster[$i] ||= { obj => $cluster, id =>
$canvas->createText($x, $y, -anchor => 'c', -width => 150, tag => 'label',
-text => 1+$i) };
my($x, $y) = @{$cluster->centroid};
$canvas->coords($r->{id}, $x, $y);
foreach my $p ($cluster->points) {
if($point{$p}{cluster} != $i) {
$point{$p}{cluster} = $i;
$canvas->itemconfigure($point{$p}{id}, -fill => $color[$i]);
}
}
}
$canvas->raise('label', 'dot');
}
#------------------------ Main Program -------------------------
MainLoop;
#------------------------ kMean module -------------------------
package Data::Cluster::kMean;
use List::Util qw(sum);
sub new {
my $class = shift;
my($max_clusters) = @_;
return bless { max_clusters => $max_clusters, data => [], cluster => [], clusters => [] }, $class;
}
sub add {
# add data point(s) (array references) -- by reference, so make sure they're not reused for something else
my $self = shift;
return unless @_;
unless(ref $_[0] eq 'ARRAY') {
@_ = [ @_ ];
}
foreach my $p (@_) {
push @{$self->{data}}, $p;
push @{$self->{cluster}}, -1; # not in a cluster
if(@{$self->{clusters}} < $self->{max_clusters}) {
my $index = @{$self->{clusters}};
push @{$self->{clusters}}, Data::Cluster::kMean::Cluster->new($self, $index);
$self->{cluster}[-1] = $index;
} else {
my $c;
{
my $j = 0;
my $min_dist;
for my $cluster (@{$self->{clusters}}) {
my $dist = _dist($p, $cluster->centroid);
if(!defined $min_dist or $dist < $min_dist) {
$c = $j;
$min_dist = $dist;
}
} continue {
$j++;
}
}
$self->{clusters}[$c]->invalidate;
$self->{cluster}[-1] = $c;
my $is_still_moving = 1;
while($is_still_moving) {
# this loop will surely converge
my @centroid = map $_->centroid, @{$self->{clusters}};
# assign all data to the new centroids
$is_still_moving = 0;
my $i = 0;
for my $p (@{$self->{data}}) {
my $c;
{
my $min_dist;
for my $j (0 .. $#{$self->{clusters}}) {
my $dist = _dist($p, $centroid[$j]);
if(!defined $min_dist or $dist < $min_dist) {
$c = $j;
$min_dist = $dist;
}
}
}
if($c != $self->{cluster}[$i]) {
$self->{clusters}[$self->{cluster}[$i]]->invalidate;
$self->{clusters}[$c]->invalidate;
$self->{cluster}[$i] = $c;
$is_still_moving = 1;
}
} continue {
$i++;
}
}
}
}
}
sub clusters {
my $self = shift;
return @{$self->{clusters}};
}
sub _dist {
# function
return sqrt(sum map { my $d = $_[0][$_]-$_[1][$_]; $d*$d } 0 .. $#{$_[0]});
}
package Data::Cluster::kMean::Cluster;
use List::Util qw(sum);
sub new {
my $class = shift;
my($parent, $index) = @_;
bless { index => $index, data => $parent->{data}, cluster => $parent->{cluster}, centroid => undef}, $class;
}
sub points {
my $self = shift;
my $index = $self->{index};
my @point = @{$self->{data}}[grep $self->{cluster}[$_] == $index, 0 .. $#{$self->{data}}];
return @point;
}
sub centroid {
my $self = shift;
return $self->{centroid} ||= _centroid($self->points);
}
sub _centroid {
# function
return undef unless @_;
my $dim = @{$_[0]};
return [ map { my $i = $_; sum(map $_->[$i], @_) / @_ } 0 .. $dim-1 ];
}
sub invalidate {
# throw away cache
my $self = shift;
undef $self->{centroid};
}
sub index {
my $self = shift;
return $self->{index};
}
1;
#!/usr/bin/perl -w
# k Means demo program
# Adapted from a VB program by Kardi Teknomo
# http://people.revoledu.com/kardi/tutorial/kMean/index.html
# Ported to Perl/Tk by bart @ Perlmonks
use strict;
# ----------------------- Tk interface -------------------------
use Tk;
my $mw = MainWindow->new( -title => "k Means Clustering, adapted from tutorial by Kardi Teknomo" );
my $topframe = $mw->Frame()->pack();
my $butframe = $mw->Frame()->pack();
my $mainframe = $mw->Frame()->pack(-fill => 'both', -expand=> 1);
$topframe->Label(
-text => "Click data in the canvas below. The program will automatically cluster the data by color code."
)->pack();
$butframe->Label(-text => "Number of clusters")->pack(-side =>'left',-padx => 5);
my $clusters_entry = $butframe->Entry(-relief => "sunken", -disabledforeground => 'darkgray');
$clusters_entry->pack(-side=>'left');
$clusters_entry->insert('end', '3');
my $button_reset = $butframe->Button(-text => "Clear Data",
-relief => "raised",
-command => \&reset_click)->pack(-side=>'left', -padx=>10);
$butframe->Label(-text => "(X, Y)",
-justify => 'right')->pack(-side=>'left', -padx=>10);
my $label_xy = $butframe->Label( -text => "X, Y")
->pack(-side=>'left', -padx=>10);
my $canvas = $mainframe->Scrolled('Canvas', -scrollbars => '',
-background => 'white')->pack(-fill=>'both',-expand=>1);
$canvas->CanvasBind( "", [ \&canvas_click, Ev('x'), Ev('y')]);
$canvas->CanvasBind( "", [ \&canvas_mousemove, Ev('x'), Ev('y')]);
$canvas->CanvasBind( "", [ \&canvas_mouseleave ]);
#----------------------- Event Handlers ------------------------
my @color = qw(red yellow green cyan blue purple gray magenta pink
chartreuse coral darkolivegreen);
# If you want to be able to have more clusters, add more colours.
my(%point, @cluster, $dataset);
sub reset_click {
$canvas->delete('all');
(@cluster, %point, $dataset) = ();
$clusters_entry->configure(-state => 'normal');
}
sub canvas_mousemove {
my $canvas = shift;
my($x, $y) = @_;
my $x1 = sprintf('%.3d', $x);
my $y1 = sprintf('%.3d', $y);
$label_xy->configure(-text => "($x1,$y1)");
}
sub canvas_mouseleave {
my $canvas = shift;
my $x1 = sprintf('%.3d', 0);
my $y1 = sprintf('%.3d', 0);
$label_xy->configure(-text => "($x1,$y1)");
}
sub canvas_click {
my $canvas = shift;
my($x, $y) = @_;
if(!$dataset) {
my $clusters = $clusters_entry->get;
if($clusters !~ /^\d+$/ or $clusters == 0 or $clusters > @color) {
warn "Not a valid value for cluster count";
return;
}
$dataset = Data::Cluster::kMean->new(0+$clusters) or die "Failed to make object";
$clusters_entry->configure(-state => 'disabled');
}
$dataset->add(my $point = [ $x, $y ]);
# A point is an array ref with coordinates
my %record = ( data => $point, cluster => -1 );
$record{id} = $canvas->createLine($x, $y, $x, $y,
-fill => 'red', -width => 8,
-capstyle => 'round', -tags => ['dot'],
);
# Keep track of point properties using a stringified reference to the point coordinates array
$point{$point} = \%record;
foreach my $cluster ($dataset->clusters) {
my $i = $cluster->index;
my $r = $cluster[$i] ||= { obj => $cluster, id =>
$canvas->createText($x, $y, -anchor => 'c', -width => 150, tag => 'label',
-text => 1+$i) };
# Move centroid label
my($x, $y) = @{$cluster->centroid};
$canvas->coords($r->{id}, $x, $y);
# Colour dots according to cluster
foreach my $p ($cluster->points) {
if($point{$p}{cluster} != $i) {
$point{$p}{cluster} = $i;
$canvas->itemconfigure($point{$p}{id}, -fill => $color[$i]);
}
}
}
$canvas->raise('label', 'dot');
}
#------------------------ Main Program -------------------------
MainLoop;
#------------------------ kMean module -------------------------
package Data::Cluster::kMean;
use List::Util qw(sum);
sub new {
my $class = shift;
my($max_clusters) = @_;
return bless { max_clusters => $max_clusters, data => [], cluster => [], clusters => [] }, $class;
}
sub add {
# add data point(s) (array references) -- by reference, so make sure they're not reused for something else
my $self = shift;
return unless @_;
unless(ref $_[0] eq 'ARRAY') {
@_ = [ @_ ];
}
foreach my $p (@_) {
push @{$self->{data}}, $p;
push @{$self->{cluster}}, -1; # not in a cluster
if(@{$self->{clusters}} < $self->{max_clusters}) {
my $index = @{$self->{clusters}};
push @{$self->{clusters}}, Data::Cluster::kMean::Cluster->new($self, $index);
$self->{cluster}[-1] = $index;
} else {
my $c;
{
my $j = 0;
my $min_dist;
for my $cluster (@{$self->{clusters}}) {
my $dist = _dist($p, $cluster->centroid);
if(!defined $min_dist or $dist < $min_dist) {
$c = $j;
$min_dist = $dist;
}
} continue {
$j++;
}
}
$self->{clusters}[$c]->invalidate;
$self->{cluster}[-1] = $c;
my $is_still_moving = 1;
while($is_still_moving) {
# this loop will surely converge
my @centroid = map $_->centroid, @{$self->{clusters}};
# assign all data to the new centroids
$is_still_moving = 0;
my $i = 0;
for my $p (@{$self->{data}}) {
my $c;
{
my $min_dist;
for my $j (0 .. $#{$self->{clusters}}) {
my $dist = _dist($p, $centroid[$j]);
if(!defined $min_dist or $dist < $min_dist) {
$c = $j;
$min_dist = $dist;
}
}
}
if($c != $self->{cluster}[$i]) {
$self->{clusters}[$self->{cluster}[$i]]->invalidate;
$self->{clusters}[$c]->invalidate;
$self->{cluster}[$i] = $c;
$is_still_moving = 1;
}
} continue {
$i++;
}
}
}
}
}
sub clusters {
# Returns a list of all Cluster objects
my $self = shift;
return @{$self->{clusters}};
}
sub _dist {
# function
return sqrt(sum map { my $d = $_[0][$_]-$_[1][$_]; $d*$d } 0 .. $#{$_[0]});
}
package Data::Cluster::kMean::Cluster;
use List::Util qw(sum);
sub new {
my $class = shift;
my($parent, $index) = @_;
bless { index => $index, data => $parent->{data}, cluster => $parent->{cluster}, centroid => undef}, $class;
}
sub points {
# Returns a list of all points in cluster
my $self = shift;
my $index = $self->{index};
my @point = @{$self->{data}}[grep $self->{cluster}[$_] == $index, 0 .. $#{$self->{data}}];
return @point;
}
sub centroid {
# Returns a point indicating the cluster's center of gravity
my $self = shift;
return $self->{centroid} ||= _centroid($self->points);
}
sub _centroid {
# function
return undef unless @_;
my $dim = @{$_[0]};
return [ map { my $i = $_; sum(map $_->[$i], @_) / @_ } 0 .. $dim-1 ];
}
sub invalidate {
# Throw away cache
my $self = shift;
undef $self->{centroid};
}
sub index {
# integer, position in cluster array of parent
my $self = shift;
return $self->{index};
}
1;
But your entry text box, for the number of clusters, now looks very big, doesn't it? And you resorted to use a fixed width for the coordinates, 3 digits per coordinate, or else the widgets would jump to the left and to the right.
I really really dispise layout managers, Java's AWT is another one like that.
The sprintf on the digits was the easiest way out, but you could have set them in there own frame, set an alignment on the label packing, and give a -width to the label, to hold it steady.
Of course in a GUI app, half your time is spent making it appear and resize correctly, and pack makes that easier in the long run.
Bad option `-disabledforeground' at c:/Perl/site/lib/Tk/Widget.pm line 196. at kmeans_clustering.pl line 24I'm sure it's a version thing, this is before:
ppm> query tk Querying target 1 (ActivePerl 5.8.0.806) 1. Tk [800.024] A Graphical User Interface ToolkitIn the process of upgrading, but thought you'd like to know...
-QM
--
Quantum Mechanics: The dreams stuff is made of
Nice stuff! I'll also point out the Algorithm::Cluster library on CPAN, which not only gives versions of kmeans, but also a number of other clustering/unsupervised pat-recognition algorithms.
One nice thing about that module is that it provides a simple way of iterating the initial randomization and repeating the clustering so you can have a bit more confidence that you're getting closer to a global separation instead of a local one. Nevertheless, really nice stuff!
perlmonks.org content © perlmonks.org and bart, bernanke01, QM, zentara
prlmnks.org © 2006 edmund von der burg (eccles & toad)
v 0.03