table of contents
Genome::Model::Tools::Music::PathScan(3pm) | User Contributed Perl Documentation | Genome::Model::Tools::Music::PathScan(3pm) |
- This is a tab-delimited file prepared from a pathway database (such as KEGG), with the columns: [path_id, path_name, class, gene_line, diseases, drugs, description] The latter three columns are optional (but are available on KEGG). The gene_line contains the "entrez_id:gene_name" of all genes involved in this pathway, each separated by a "|" symbol.
- For example, a line in the pathway-file would look like:
hsa00061 Fatty acid biosynthesis Lipid Metabolism 31:ACACA|32:ACACB|27349:MCAT|2194:FASN|54995:OXSM|55301:OLAH
Ensure that the gene names and entrez IDs used match those used in the MAF file. Entrez IDs are not mandatory (use a 0 if Entrez ID unknown). But if a gene name in the MAF does not match any gene name in this file, the entrez IDs are used to find a match (unless it's a 0).
EOS
); }
sub _doc_authors {
return <<EOS
Michael Wendl, Ph.D. EOS }
sub _doc_credits {
return <<EOS This module uses reformatted copies of data from the Kyoto
Encyclopedia of Genes and Genomes (KEGG) database:
* KEGG - http://www.genome.jp/kegg/ EOS }
sub execute {
my $self = shift;
my $covg_dir =
$self->gene_covg_dir;
my $bam_list =
$self->bam_list;
my $pathway_file =
$self->pathway_file;
my $maf_file =
$self->maf_file;
my $output_file =
$self->output_file;
my $bgd_mut_rate =
$self->bmr;
my $genes_to_ignore =
$self->genes_to_ignore;
my $min_mut_genes_per_path =
$self->min_mut_genes_per_path;
my $skip_non_coding =
$self->skip_non_coding;
my $skip_silent =
$self->skip_silent;
# Check on all the input data before starting work print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file ); print STDERR "Directory with gene coverages not found: $covg_dir\n" unless( -e $covg_dir ); print STDERR "List of samples not found or is empty: $bam_list\n" unless( -s $bam_list ); print STDERR "Pathway info file not found or is empty: $pathway_file\n" unless( -s $pathway_file ); return undef unless( -s $maf_file && -e $covg_dir && -s $bam_list && -s $pathway_file ); # Build a hash to quickly lookup the genes whose mutations should be ignored my %ignored_genes = (); if( defined $genes_to_ignore ) { %ignored_genes = map { $_ => 1 } split( /,/, $genes_to_ignore ); } # PathScan uses a helluva lot of hashes - all your RAM are belong to it my %sample_gene_hash; # sample => array of genes (based on maf) my %gene_path_hash; # gene => array of pathways (based on path_file) my %path_hash; # pathway => all the information about the pathways in the database my %sample_path_hash; # sample => pathways (based on %sample_gene_hash and %gene_path_hash) my %path_sample_hits_hash; # path => sample => hits,mutated_genes my %gene_sample_cov_hash; # gene => sample => coverage my @all_sample_names; # names of all the samples, no matter if it's mutated or not my %id_gene_hash; # entrez id => gene (based on first two columns in MAF) # Parse out the names of the samples which should match the names in the MAF file my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n"; while( my $line = $sampleFh->getline ) { next if ( $line =~ m/^#/ ); chomp( $line ); my ( $sample ) = split( /\t/, $line ); push( @all_sample_names, $sample ); } $sampleFh->close; # Read coverage data calculated by the Music::Bmr::CalcCovg $covg_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any read_CoverageFiles( $covg_dir, \@all_sample_names, \%gene_sample_cov_hash ); #build gene => average_coverage hash for population test my %gene_cov_hash; foreach my $gene ( keys %gene_sample_cov_hash ) { my $total_cov = 0; my $sample_num = scalar( @all_sample_names ); $total_cov += $gene_sample_cov_hash{$gene}{$_} foreach( @all_sample_names ); $gene_cov_hash{$gene} = int( $total_cov / $sample_num ); } #build %sample_gene_hash based on maf my $maf_fh = IO::File->new( $maf_file ); while( my $line = $maf_fh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp( $line ); my @cols = split( /\t/, $line ); my ( $gene, $entrez_id, $mutation_class, $tumor_sample ) = ( $cols[0], $cols[1], $cols[8], $cols[15] ); # If the mutation classification is odd, quit with error if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|Silent|In_Frame_Ins|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { print STDERR "Unrecognized Variant_Classification $mutation_class in MAF file.\n"; print STDERR "Please use TCGA MAF Specification v2.3.\n"; return undef; } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $skip_silent && $mutation_class =~ m/^Silent$/ )) { print STDERR "Skipping $mutation_class mutation in gene $gene.\n"; next; } # Check that the user followed instructions and named each sample correctly unless( grep( /^$tumor_sample$/, @all_sample_names )) { print STDERR "Sample $tumor_sample in MAF file does not match any in $bam_list\n"; return undef; } next if( defined $ignored_genes{$gene} ); # Ignore variants in genes that user wants ignored $id_gene_hash{$entrez_id} = $gene unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ ); push( @{$sample_gene_hash{$tumor_sample}}, $gene ) unless( grep /^$gene$/, @{$sample_gene_hash{$tumor_sample}} ); } $maf_fh->close; my $path_fh = IO::File->new( $pathway_file ); while( my $line = $path_fh->getline ) { chomp( $line ); next if( $line =~ /^(#|ID)/ ); #Skip headers my ( $path_id, $name, $class, $gene_line, $diseases, $drugs, $description ) = split( /\t/, $line ); my @genes = split( /\|/, $gene_line ); #Each gene is in the format "EntrezID:GeneSymbol" $diseases =~ s/\|/, /g; #Change the separators to commas $drugs =~ s/\|/, /g; #Change the separators to commas $path_hash{$path_id}{name} = $name unless( $name eq '' ); $path_hash{$path_id}{class} = $class unless( $class eq '' ); $path_hash{$path_id}{diseases} = $diseases unless( $diseases eq '' ); $path_hash{$path_id}{drugs} = $drugs unless( $drugs eq '' ); $path_hash{$path_id}{description} = $description unless( $description eq '' ); @{$path_hash{$path_id}{gene}} = (); foreach my $gene ( @genes ) { my ( $entrez_id, $gene_symbol ) = split( /:/, $gene ); unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ ) { # Use the gene name from the MAF file if the entrez ID matches $gene_symbol = $id_gene_hash{$entrez_id} if( defined $id_gene_hash{$entrez_id} ); } push( @{$gene_path_hash{$gene_symbol}}, $path_id ) unless( grep /^$path_id$/, @{$gene_path_hash{$gene_symbol}} ); unless( grep /^$gene_symbol$/, @{$path_hash{$path_id}{gene}} ) { push( @{$path_hash{$path_id}{gene}}, $gene_symbol ); } } } $path_fh->close; #build a sample => pathway hash foreach my $sample ( keys %sample_gene_hash ) { foreach my $gene ( @{$sample_gene_hash{$sample}} ) { if( defined $gene_path_hash{$gene} ) { foreach my $pathway ( @{$gene_path_hash{$gene}} ) { push( @{$sample_path_hash{$sample}}, $pathway ) unless( grep /^$pathway$/, @{$sample_path_hash{$sample}} ); } } } } #build path_sample_hits_hash, for population test foreach my $sample ( keys %sample_path_hash ) { foreach my $path ( @{$sample_path_hash{$sample}} ) { my $hits = 0; my @mutated_genes = (); #Mutated genes in this sample belonging to this pathway my @mutated_genes_in_sample = @{$sample_gene_hash{$sample}}; foreach my $gene ( @{$path_hash{$path}{gene}} ) { if( grep /^$gene$/, @mutated_genes_in_sample ) #if this gene is mutated in this sample (in maf) { $hits++; push( @mutated_genes, $gene ); } } if( $hits > 0 ) { $path_sample_hits_hash{$path}{$sample}{hits} = $hits; $path_sample_hits_hash{$path}{$sample}{mutated_genes} = \@mutated_genes; } } } #Calculation of p value my %data; #For printing my @pvals; foreach my $path ( sort keys %path_hash ) { my @pathway_genes = @{$path_hash{$path}{gene}}; my @gene_sizes = (); foreach my $gene ( @pathway_genes ) { if( defined $gene_cov_hash{$gene} ) { my $avg_cov = int( $gene_cov_hash{$gene} ); push( @gene_sizes, $avg_cov ) if( $avg_cov > 3 ); } } #If this pathway doesn't have any gene coverage, skip it next unless( scalar( @gene_sizes ) > 0 ); my @num_hits_per_sample; #store hits info for each patient my @mutated_samples = sort keys %{$path_sample_hits_hash{$path}}; foreach my $sample ( @all_sample_names ) { my $hits = 0; #if this sample has mutation if( grep /^$sample$/, @mutated_samples ) { $hits = $path_sample_hits_hash{$path}{$sample}{hits}; } push( @num_hits_per_sample, $hits ); } #If this pathway doesn't have any mutated genes in any samples, skip it next unless( scalar( @num_hits_per_sample ) > 0 ); my $hits_ref = \@num_hits_per_sample; ########### MCW ADDED # FIND MAX NUMBER OF HITS IN A SAMPLE my $max_hits = 0; foreach my $hits_in_sample ( @num_hits_per_sample ) { $max_hits = $hits_in_sample if( $hits_in_sample > $max_hits ); } ########### MCW ADDED my $pop_obj = Genome::Model::Tools::Music::PathScan::PopulationPathScan->new( \@gene_sizes ); if( scalar( @gene_sizes ) >= 3 ) { ########### MCW ADDED if( $max_hits > 15 ) { $pop_obj->assign( 5 ); } else { $pop_obj->assign( 3 ); } ########### MCW ADDED #$pop_obj->assign(3); } elsif( @gene_sizes == 2 ) { $pop_obj->assign( 2 ); } else { $pop_obj->assign( 1 ); } $pop_obj->preprocess( $bgd_mut_rate, $hits_ref ); #mwendl's new fix my $pval = $pop_obj->population_pval_approx($hits_ref); $data{$pval}{$path}{samples} = \@mutated_samples; $data{$pval}{$path}{hits} = $hits_ref; push( @pvals, $pval ); # For calculation of FDR } # Calculate False Discovery Rates (Benjamini-Hochberg FDR) for the p-values my $pval_cnt = scalar( @pvals ); my %fdr_hash; for( my $i = 0; $i < $pval_cnt; $i++ ) { my $fdr = $pvals[$i] * $pval_cnt / ( $pval_cnt - $i ); $fdr = 1 if $fdr > 1; $fdr_hash{$pvals[$i]} = $fdr; } # Print two output files, one more detailed than the other my $out_fh = IO::File->new( $output_file, ">" ); my $out_detailed_fh = IO::File->new( "$output_file\_detailed", ">" ); $out_fh->print( "Pathway\tName\tClass\tSamples_Affected\tTotal_Variations\tp-value\tFDR\n" ); foreach my $pval ( sort { $a <=> $b } keys %data ) { foreach my $path ( sort keys %{$data{$pval}} ) { # Skip this pathway if it has fewer affected genes than the user wants my %mutated_gene_hash; my @samples = @{$data{$pval}{$path}{samples}}; foreach my $sample ( @samples ) { foreach my $gene ( @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}} ) { $mutated_gene_hash{$gene}++; } } next unless ( scalar( keys %mutated_gene_hash ) >= $min_mut_genes_per_path ); # Print detailed output to a separate output file $out_detailed_fh->print( "Pathway: $path\n" ); $out_detailed_fh->print( "Name: ", $path_hash{$path}{name}, "\n" ) if( defined $path_hash{$path}{name} ); $out_detailed_fh->print( "Class: ", $path_hash{$path}{class}, "\n" ) if( defined $path_hash{$path}{class} ); $out_detailed_fh->print( "Diseases: ", $path_hash{$path}{diseases}, "\n" ) if( defined $path_hash{$path}{diseases} ); $out_detailed_fh->print( "Drugs: ", $path_hash{$path}{drugs}, "\n" ) if( defined $path_hash{$path}{drugs} ); $out_detailed_fh->print( "P-value: $pval\n", "FDR: ", $fdr_hash{$pval}, "\n" ); $out_detailed_fh->print( "Description: ", $path_hash{$path}{description}, "\n" ); my @hits = @{$data{$pval}{$path}{hits}}; foreach my $sample ( @samples ) { my @mutated_genes = @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}}; $out_detailed_fh->print( "$sample:" ); $out_detailed_fh->print( join ",", @mutated_genes ); $out_detailed_fh->print( "\n" ); } my ( $mutSampleCnt, $totalMutGenes ) = ( 0, 0 ); $out_detailed_fh->print( "Samples with mutations (#hits): " ); for( my $i = 0; $i < scalar( @all_sample_names ); ++$i ) { if( $hits[$i] > 0 ) { $out_detailed_fh->print( "$all_sample_names[$i]($hits[$i]) " ); $mutSampleCnt++; $totalMutGenes += $hits[$i]; } } $out_detailed_fh->print( "\n\n" ); # Print tabulated output to the main output file my ( $path_name, $path_class ) = ( "-", "-" ); $path_name = $path_hash{$path}{name} if( defined $path_hash{$path}{name} ); $path_class = $path_hash{$path}{class} if( defined $path_hash{$path}{class} ); $out_fh->print( "$path\t$path_name\t$path_class\t$mutSampleCnt\t$totalMutGenes\t", "$pval\t", $fdr_hash{$pval}, "\n" ); } } $out_detailed_fh->close; $out_fh->close; return 1; }
# Reads files for each sample which are formatted as tab-separated
lines each showing the number of # bases with sufficient coverage in a gene.
sub read_CoverageFiles {
my ( $covg_dir,
$all_samples_ref,
$gene_sample_cov_hash_ref ) = (
$_[0], $_[1],
$_[2] );
# Read per-gene covered base counts for each sample foreach my $sample ( @{$all_samples_ref} ) { # If the file doesn't exist, quit with error. The Music::Bmr::CalcCovg step is incomplete unless( -s "$covg_dir/$sample.covg" ) { print STDERR "Couldn't find $sample.covg in $covg_dir. (music bmr calc-covg possibly incomplete)\n"; exit 1; } my $covgFh = IO::File->new( "$covg_dir/$sample.covg" ); while( my $line = $covgFh->getline ) { next if( $line =~ m/^#/ ); my ( $gene, undef, $covd_bases ) = split( /\t/, $line ); $gene_sample_cov_hash_ref->{$gene}{$sample} = $covd_bases; } $covgFh->close; } }
1;
2020-11-06 | perl v5.30.3 |