table of contents
Genome::Model::Tools::Music::Bmr::CalcBmr(3pm) | User Contributed Perl Documentation | Genome::Model::Tools::Music::Bmr::CalcBmr(3pm) |
EOS
); }
sub _doc_authors {
return " Cyriac Kandoth, Ph.D."; }
sub _doc_see_also {
return <<EOS genome-music-bmr(1), genome-music(1),
genome(1) EOS }
sub execute {
my $self = shift;
my $roi_file =
$self->roi_file;
my $ref_seq =
$self->reference_sequence;
my $bam_list =
$self->bam_list;
my $output_dir =
$self->output_dir;
my $maf_file =
$self->maf_file;
my $show_skipped =
$self->show_skipped;
my $bmr_groups =
$self->bmr_groups;
my $separate_truncations =
$self->separate_truncations;
my $merge_concurrent_muts =
$self->merge_concurrent_muts;
my $genes_to_ignore =
$self->genes_to_ignore;
my $skip_non_coding =
$self->skip_non_coding;
my $skip_silent =
$self->skip_silent;
# Check on all the input data before starting work print STDERR "ROI file not found or is empty: $roi_file\n" unless( -s $roi_file ); print STDERR "Reference sequence file not found: $ref_seq\n" unless( -e $ref_seq ); print STDERR "List of BAMs not found or is empty: $bam_list\n" unless( -s $bam_list ); print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir ); print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file ); return undef unless( -s $roi_file && -e $ref_seq && -s $bam_list && -e $output_dir && -s $maf_file ); # Check on the files we expect to find within the provided output directory $output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any my $gene_covg_dir = "$output_dir/gene_covgs"; # Should contain per-gene coverage files per sample my $total_covgs_file = "$output_dir/total_covgs"; # Should contain overall coverages per sample print STDERR "Directory with per-gene coverages not found: $gene_covg_dir\n" unless( -e $gene_covg_dir ); print STDERR "Total coverages file not found or is empty: $total_covgs_file\n" unless( -s $total_covgs_file ); return undef unless( -e $gene_covg_dir && -s $total_covgs_file ); # Outputs of this script will be written to these locations in the output directory my $overall_bmr_file = "$output_dir/overall_bmrs"; my $gene_mr_file = "$output_dir/gene_mrs"; $self->gene_mr_file( $gene_mr_file ); # Build a hash to quickly lookup the genes to be ignored for overall BMRs my %ignored_genes = (); if( defined $genes_to_ignore ) { %ignored_genes = map { $_ => 1 } split( /,/, $genes_to_ignore ); } # Parse out the names of the samples which should match the names of the coverage files needed my ( @all_sample_names, %sample_idx ); my $idx = 0; my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!"; while( my $line = $sampleFh->getline ) { next if ( $line =~ m/^#/ ); chomp( $line ); my ( $sample ) = split( /\t/, $line ); push( @all_sample_names, $sample ); $sample_idx{$sample} = $idx++; } $sampleFh->close; # If the reference sequence FASTA file hasn't been indexed, do it my $ref_seq_idx = "$ref_seq.fai"; Genome::Sys->shellcmd( cmd => "samtools faidx $ref_seq" ) unless( -e $ref_seq_idx ); # Parse gene names and ROIs. Mutations outside these ROIs will be skipped my ( @all_gene_names, %gene_idx ); $idx = 0; my $roi_bitmask = $self->create_empty_genome_bitmask( $ref_seq_idx ); my $roiFh = IO::File->new( $roi_file ) or die "Couldn't open $roi_file. $!"; while( my $line = $roiFh->getline ) { next if( $line =~ m/^#/ ); chomp $line; my ( $chr, $start, $stop, $gene ) = split( /\t/, $line ); if( !$roi_bitmask->{$chr} or $start > $roi_bitmask->{$chr}->Size ) { print STDERR "Skipping invalid ROI bitmask $chr:$start-$stop\n"; next; } $roi_bitmask->{$chr}->Interval_Fill( $start, $stop ); unless( defined $gene_idx{$gene} ) { push( @all_gene_names, $gene ); $gene_idx{$gene} = $idx++; } } $roiFh->close; # These are the various categories that each mutation will be classified into my @mut_classes = ( AT_Transitions, AT_Transversions, CG_Transitions, CG_Transversions, CpG_Transitions, CpG_Transversions, Indels ); push( @mut_classes, Truncations ) if( $separate_truncations ); # Save the actual class names for reporting purposes, because the elements above are really just numerical constants my @mut_class_names = qw( AT_Transitions AT_Transversions CG_Transitions CG_Transversions CpG_Transitions CpG_Transversions Indels ); push( @mut_class_names, 'Truncations' ) if( $separate_truncations ); my @sample_mr; # Stores per sample covg and mutation information foreach my $sample ( @all_sample_names ) { $sample_mr[$sample_idx{$sample}][$_][mutations] = 0 foreach( @mut_classes ); $sample_mr[$sample_idx{$sample}][$_][covd_bases] = 0 foreach( @mut_classes ); } # Load the covered base-counts per sample from the output of "music bmr calc-covg" print STDERR "Loading per-sample coverages stored in $total_covgs_file\n"; my $sample_cnt_in_total_covgs_file = 0; my $totCovgFh = IO::File->new( $total_covgs_file ) or die "Couldn't open $total_covgs_file. $!"; while( my $line = $totCovgFh->getline ) { next unless( $line =~ m/^\S+\t\d+\t\d+\t\d+\t\d+$/ and $line !~ m/^#/ ); chomp( $line ); ++$sample_cnt_in_total_covgs_file; my ( $sample, $covd_bases, $covd_at_bases, $covd_cg_bases, $covd_cpg_bases ) = split( /\t/, $line ); $sample_mr[$sample_idx{$sample}][AT_Transitions][covd_bases] = $covd_at_bases; $sample_mr[$sample_idx{$sample}][AT_Transversions][covd_bases] = $covd_at_bases; $sample_mr[$sample_idx{$sample}][CG_Transitions][covd_bases] = $covd_cg_bases; $sample_mr[$sample_idx{$sample}][CG_Transversions][covd_bases] = $covd_cg_bases; $sample_mr[$sample_idx{$sample}][CpG_Transitions][covd_bases] = $covd_cpg_bases; $sample_mr[$sample_idx{$sample}][CpG_Transversions][covd_bases] = $covd_cpg_bases; $sample_mr[$sample_idx{$sample}][Indels][covd_bases] = $covd_bases; $sample_mr[$sample_idx{$sample}][Truncations][covd_bases] = $covd_bases if( $separate_truncations ); } $totCovgFh->close; unless( $sample_cnt_in_total_covgs_file == scalar( @all_sample_names )) { print STDERR "Mismatching number of samples in $total_covgs_file and $bam_list\n"; return undef; } my @gene_mr; # Stores per gene covg and mutation information foreach my $gene ( @all_gene_names ) { foreach my $sample ( @all_sample_names ) { $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][mutations] = 0 foreach( @mut_classes ); $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][covd_bases] = 0 foreach( @mut_classes ); } } # Sum up the per-gene covered base-counts across samples from the output of "music bmr calc-covg" print STDERR "Loading per-gene coverage files stored under $gene_covg_dir/\n"; foreach my $sample ( @all_sample_names ) { my $sample_covg_file = "$gene_covg_dir/$sample.covg"; my $sampleCovgFh = IO::File->new( $sample_covg_file ) or die "Couldn't open $sample_covg_file. $!"; while( my $line = $sampleCovgFh->getline ) { next unless( $line =~ m/^\S+\t\d+\t\d+\t\d+\t\d+\t\d+$/ and $line !~ m/^#/ ); chomp( $line ); my ( $gene, undef, $covd_bases, $covd_at_bases, $covd_cg_bases, $covd_cpg_bases ) = split( /\t/, $line ); $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][AT_Transitions][covd_bases] += $covd_at_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][AT_Transversions][covd_bases] += $covd_at_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CG_Transitions][covd_bases] += $covd_cg_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CG_Transversions][covd_bases] += $covd_cg_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CpG_Transitions][covd_bases] += $covd_cpg_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][CpG_Transversions][covd_bases] += $covd_cpg_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][Indels][covd_bases] += $covd_bases; $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][Truncations][covd_bases] += $covd_bases if( $separate_truncations ); } $sampleCovgFh->close; } # Run "joinx ref-stats" to classify SNVs as being at AT, CG, or CpG sites in the reference print STDERR "Running 'joinx ref-stats' to read reference FASTA and identify SNVs at AT, CG, CpG sites\n"; my $maf_bed = Genome::Sys->create_temp_file_path(); my $mafBedFh = IO::File->new( $maf_bed, ">" ) or die "Temporary file could not be created. $!"; my $mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!"; while( my $line = $mafFh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $chr, $start, $stop, $mutation_type, $ref, $var1, $var2 ) = @cols[4..6,9..12]; if( $mutation_type =~ m/^(SNP|DNP|ONP|TNP)$/ ) { $mafBedFh->print( "$chr\t" . ( $start - 2 ) . "\t" . ( $start + 1 ) . "\n" ); } } $mafFh->close; $mafBedFh->close; my $refstats_file = Genome::Sys->create_temp_file_path(); Genome::Sys->shellcmd( cmd => "joinx ref-stats --ref-bases --bed $maf_bed --fasta $ref_seq --output $refstats_file" ); # Parse through the ref-stats output and load it into hashes for quick lookup later my ( %ref_base, %cpg_site ); my $refStatsFh = IO::File->new( $refstats_file ) or die "Couldn't open $refstats_file. $!"; while( my $line = $refStatsFh->getline ) { next if( $line =~ m/^#/ ); chomp $line; my ( $chr, undef, $pos, undef, undef, undef, $ref ) = split( /\t/, $line ); my $locus = "$chr\t" . ( $pos - 1 ); $ref_base{$locus} = substr( $ref, 1, 1 ); $cpg_site{$locus} = 1 if( $ref =~ m/CG/ ); } $refStatsFh->close; # Create a hash to help classify SNVs my %classify; $classify{$_} = AT_Transitions foreach( qw( AG TC )); $classify{$_} = AT_Transversions foreach( qw( AC AT TA TG )); $classify{$_} = CG_Transitions foreach( qw( CT GA )); $classify{$_} = CG_Transversions foreach( qw( CA CG GC GT )); # Parse through the MAF file and categorize each somatic mutation print STDERR "Parsing MAF file to classify mutations\n"; my %skip_cnts; $mafFh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!"; while( my $line = $mafFh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15]; # Skip mutations in samples that are not in the provided bam list unless( defined $sample_idx{$sample} ) { $skip_cnts{"belong to unrecognized samples"}++; print STDERR "Skipping unrecognized sample ($sample not in BAM list): $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } # If the mutation classification is odd, quit with error if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { print STDERR "Unrecognized Variant_Classification \"$mutation_class\" in MAF file: $gene, $chr:$start-$stop\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $skip_silent && $mutation_class =~ m/^Silent$/ )) { $skip_cnts{"are classified as $mutation_class"}++; print STDERR "Skipping $mutation_class mutation: $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } # If the mutation type is odd, quit with error if( $mutation_type !~ m/^(SNP|DNP|TNP|ONP|INS|DEL|Consolidated)$/ ) { print STDERR "Unrecognized Variant_Type \"$mutation_type\" in MAF file: $gene, $chr:$start-$stop\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # Skip mutations that were consolidated into others (E.g. SNP consolidated into a TNP) if( $mutation_type =~ m/^Consolidated$/ ) { $skip_cnts{"are consolidated into another"}++; print STDERR "Skipping consolidated mutation: $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } # Skip mutations that fall completely outside any of the provided regions of interest if( $self->count_bits( $roi_bitmask->{$chr}, $start, $stop ) == 0 ) { $skip_cnts{"are outside any ROIs"}++; print STDERR "Skipping mutation that falls outside ROIs: $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } # Skip mutations whose gene names don't match any of those in the ROI list unless( defined $gene_idx{$gene} ) { $skip_cnts{"have unrecognized gene names"}++; print STDERR "Skipping unrecognized gene name (not in ROI file): $gene, $chr:$start-$stop\n" if( $show_skipped ); next; } my $class = ''; # Check if the mutation is the truncating type, if the user wanted a separate category of those if( $separate_truncations && $mutation_class =~ m/^(Nonsense_Mutation|Splice_Site|Frame_Shift_Del|Frame_Shift_Ins)/ ) { $class = Truncations; } # Classify the mutation as AT/CG/CpG Transition, AT/CG/CpG Transversion elsif( $mutation_type =~ m/^(SNP|DNP|ONP|TNP)$/ ) { # ::TBD:: For DNPs and TNPs, we use only the first base for mutation classification $ref = substr( $ref, 0, 1 ); $var1 = substr( $var1, 0, 1 ); $var2 = substr( $var2, 0, 1 ); # If the alleles are anything but A, C, G, or T then quit with error if( $ref !~ m/[ACGT]/ || $var1 !~ m/[ACGT]/ || $var2 !~ m/[ACGT]/ ) { print STDERR "Unrecognized allele in column Reference_Allele, Tumor_Seq_Allele1, or Tumor_Seq_Allele2: $gene, $chr:$start-$stop\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # Use the classify hash to find whether this SNV is an AT/CG Transition/Transversion $class = $classify{ "$ref$var1" } if( defined $classify{ "$ref$var1" } ); $class = $classify{ "$ref$var2" } if( defined $classify{ "$ref$var2" } ); # Check if the ref base in the MAF matched what we fetched from the ref-seq my $locus = "$chr\t$start"; if( defined $ref_base{$locus} && $ref_base{$locus} ne $ref ) { print STDERR "Reference allele $ref for $gene variant at $chr:$start-$stop is " . $ref_base{$locus} . " in the FASTA. Using it anyway.\n"; } # Check if a C or G reference allele belongs to a CpG pair in the refseq if(( $ref eq 'C' || $ref eq 'G' ) && defined $cpg_site{$locus} ) { $class = (( $class == CG_Transitions ) ? CpG_Transitions : CpG_Transversions ); } } # Classify it as an indel (excludes splice-site and frame-shift if user wanted truncations separately) elsif( $mutation_type =~ m/^(INS|DEL)$/ ) { $class = Indels; } # The user's gene exclusion list affects only the overall BMR calculations $sample_mr[$sample_idx{$sample}][$class][mutations]++ unless( defined $ignored_genes{$gene} ); $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]++; } $mafFh->close; # Display statistics related to parsing the MAF print STDERR "Finished Parsing the MAF file to classify mutations\n"; foreach my $skip_type ( sort {$skip_cnts{$b} <=> $skip_cnts{$a}} keys %skip_cnts ) { print STDERR "Skipped " . $skip_cnts{$skip_type} . " mutation(s) that $skip_type\n" if( defined $skip_cnts{$skip_type} ); } # If the user wants, merge together concurrent mutations of a gene in the same sample if( $merge_concurrent_muts ) { foreach my $sample ( @all_sample_names ) { foreach my $gene ( @all_gene_names ) { next unless( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}] ); my $num_muts = 0; $num_muts += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$_][mutations] foreach( @mut_classes ); if( $num_muts > 1 ) { foreach my $class ( @mut_classes ) { my $muts_in_class = $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]; # Num of muts of gene in this class $sample_mr[$sample_idx{$sample}][$class][mutations] -= $muts_in_class; # Take it out of the sample total $muts_in_class /= $num_muts; # Turn it into a fraction of the total number of muts in this gene $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations] = $muts_in_class; # Use the fraction as the num muts of gene in this class $sample_mr[$sample_idx{$sample}][$class][mutations] += $muts_in_class; # Add the same fraction to the sample total } } } } } # Calculate per-sample BMRs, and also subtract out covered bases in genes the user wants ignored foreach my $sample ( @all_sample_names ) { my $tot_muts = 0; foreach my $class ( @mut_classes ) { # Subtract the covered bases in this class that belong to the genes to be ignored # ::TBD:: Some of these bases may also belong to another gene (on the other strand maybe?), and those should not be subtracted foreach my $ignored_gene ( keys %ignored_genes ) { $sample_mr[$sample_idx{$sample}][$class][covd_bases] -= $gene_mr[$sample_idx{$sample}][$gene_idx{$ignored_gene}][$class][covd_bases] if( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$ignored_gene}] ); } $tot_muts += $sample_mr[$sample_idx{$sample}][$class][mutations]; } $sample_mr[$sample_idx{$sample}][Overall][bmr] = $tot_muts / $sample_mr[$sample_idx{$sample}][Indels][covd_bases]; } # Cluster samples into bmr-groups using k-means clustering my @sample_bmrs = map { $sample_mr[$sample_idx{$_}][Overall][bmr] } @all_sample_names; my @bmr_clusters = k_means( $bmr_groups, \@sample_bmrs ); # Calculate overall BMRs for each cluster of samples, and print them to file my %cluster_bmr; # Stores per cluster categorized BMR my $totBmrFh = IO::File->new( $overall_bmr_file, ">" ) or die "Couldn't open $overall_bmr_file. $!"; $totBmrFh->print( "#User-specified genes skipped in these calculations: $genes_to_ignore\n" ) if( defined $genes_to_ignore ); my ( $covered_bases_sum, $mutations_sum ) = ( 0, 0 ); for( my $i = 0; $i < scalar( @bmr_clusters ); ++$i ) { my @samples_in_cluster = map { $all_sample_names[$_] } @{$bmr_clusters[$i]}; unless( $bmr_groups == 1 ) { $totBmrFh->print( "#BMR sub-group ", $i + 1, " (", scalar( @{$bmr_clusters[$i]} ), " samples)\n" ); $totBmrFh->print( "#Samples: ", join( ",", @samples_in_cluster ), "\n" ); } $totBmrFh->print( "#Mutation_Class\tCovered_Bases\tMutations\tOverall_BMR\n" ); my ( $tot_covd_bases, $tot_muts ) = ( 0, 0 ); foreach my $class ( @mut_classes ) { my ( $covd_bases, $mutations ) = ( 0, 0 ); foreach my $sample ( @samples_in_cluster ) { $covd_bases += $sample_mr[$sample_idx{$sample}][$class][covd_bases]; $mutations += $sample_mr[$sample_idx{$sample}][$class][mutations]; } $tot_covd_bases = $covd_bases if( $class == Indels ); # Save this to calculate overall BMR below # Calculate overall BMR for this mutation class and print it to file $cluster_bmr{$i}[$class][bmr] = ( $covd_bases == 0 ? 0 : ( $mutations / $covd_bases )); $totBmrFh->print( join( "\t", $mut_class_names[$class], $covd_bases, $mutations, $cluster_bmr{$i}[$class][bmr] ), "\n" ); $tot_muts += $mutations; } $totBmrFh->print( join( "\t", "Overall_BMR", $tot_covd_bases, $tot_muts, $tot_muts / $tot_covd_bases ), "\n\n" ); $covered_bases_sum += $tot_covd_bases; $mutations_sum += $tot_muts; } $totBmrFh->close; $self->bmr_output( $mutations_sum / $covered_bases_sum ); # Print out a file containing per-gene mutation counts and covered bases for use by "music smg" my $geneBmrFh = IO::File->new( $gene_mr_file, ">" ) or die "Couldn't open $gene_mr_file. $!"; $geneBmrFh->print( "#Gene\tMutation_Class\tCovered_Bases\tMutations\tBMR\n" ); foreach my $gene ( sort @all_gene_names ) { my ( $tot_covd_bases, $tot_muts ) = ( 0, 0 ); for( my $i = 0; $i < scalar( @bmr_clusters ); ++$i ) { my @samples_in_cluster = map { $all_sample_names[$_] } @{$bmr_clusters[$i]}; foreach my $class ( @mut_classes ) { my ( $covd_bases, $mutations ) = ( 0, 0 ); foreach my $sample( @samples_in_cluster ) { if( defined $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}] ) { $covd_bases += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][covd_bases]; $mutations += $gene_mr[$sample_idx{$sample}][$gene_idx{$gene}][$class][mutations]; } } my $rename_class = $mut_class_names[$class]; $rename_class = ( $rename_class . "_SubGroup" . ( $i + 1 )) if( $bmr_groups > 1 ); $geneBmrFh->print( join( "\t", $gene, $rename_class, $covd_bases, $mutations, $cluster_bmr{$i}[$class][bmr] ), "\n" ); $tot_muts += $mutations; $tot_covd_bases += $covd_bases if( $class == Indels ); } } $geneBmrFh->print( join( "\t", $gene, "Overall", $tot_covd_bases, $tot_muts, $self->bmr_output ), "\n" ); } $geneBmrFh->close; return 1; }
# Creates an empty whole genome bitmask based on the given
reference sequence index sub create_empty_genome_bitmask {
my ( $self, $ref_seq_idx_file
) = @_;
my %genome;
my $refFh = IO::File->new(
$ref_seq_idx_file ) or die "Couldn't open
$ref_seq_idx_file. $!";
while( my $line =
$refFh->getline ) {
my ( $chr, $length ) = split(
/\t/, $line );
$genome{$chr} = Bit::Vector->new(
$length + 1 ); # Adding a base for 1-based
coordinates
}
$refFh->close;
return \%genome; }
# Counts the number of bits that are set in the given region of a
Bit:Vector sub count_bits {
my ( $self, $vector,
$start, $stop ) =
@_;
my $count = 0;
for my $pos ( $start..$stop )
{
++$count if( $vector->bit_test(
$pos ));
}
return $count; }
# Given a list of numerical values, returns k clusters based on
k-means clustering sub k_means {
my ( $k, $list_ref ) =
@_;
my @vals = @{$list_ref};
my $num_vals = scalar( @vals
);
# Start with the first k values as the centroids my @centroids = @vals[0..($k-1)]; my @prev_centroids = map { 0 } @centroids; my @groups = (); my $diff_means = 1; # Arbitrary non-zero value # Repeat until the difference between these centroids and the previous ones, converges to zero while( $diff_means > 0 ) { @groups = (); # Group values into clusters based on closest centroid for( my $i = 0; $i < $num_vals; ++$i ) { my @distances = map { abs( $vals[$i] - $_ ) } @centroids; my $closest = min( @distances ); for( my $j = 0; $j < $k; ++$j ) { if( $distances[$j] == $closest ) { push( @{$groups[$j]}, $i ); last; } } } # Calculate means to be the new centroids, and the sum of differences $diff_means = 0; for( my $i = 0; $i < $k; ++$i ) { $centroids[$i] = sum( map {$vals[$_]} @{$groups[$i]} ); $centroids[$i] /= scalar( @{$groups[$i]} ); $diff_means += abs( $centroids[$i] - $prev_centroids[$i] ); } # Save the current centroids for comparisons with those in the next iteration @prev_centroids = @centroids; } return @groups; }
1;
2020-11-06 | perl v5.30.3 |