table of contents
Genome::Model::Tools::Music::Survival(3pm) | User Contributed Perl Documentation | Genome::Model::Tools::Music::Survival(3pm) |
- Headers are required.
- The first column of each clinical data file must contain sample IDs which match those in both the --bam-list and the MAF variant list (in the MAF, this is the Tumor_Sample_Barcode column, specifically).
- In at least one of the clinical data files input, columns with headers "vital_status" and "days_to_last_followup" (case insensitive) must exist. "vital_status" must be delineated by 1's and 0's, where 0 denotes 'living', and 1 denotes 'deceased'.
Note that all input files must be tab-separated.
HELP }
sub _additional_help_sections {
return (
"ARGUMENTS",
<<EOS
EOS
); }
sub _doc_authors {
return <<EOS
Nathan D. Dees, Ph.D.
Qunyuan Zhang, Ph.D. EOS }
sub execute {
# parse input arguments my $self = shift; my $bam_list = $self->bam_list; my $genetic_data_type = $self->genetic_data_type; my $legend_placement = $self->legend_placement; # handle phenotype inclusions my @phenotypes_to_include; my @clinical_phenotypes_to_include; my @mutated_genes_to_include; if ($self->phenotypes_to_include) { @phenotypes_to_include = split /,/,$self->phenotypes_to_include; } # check genetic data type unless ($genetic_data_type =~ /^gene|variant$/i) { $self->error_message("Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter."); return; } # load clinical data and analysis types my %clinical_data; if ($self->numeric_clinical_data_file) { $clinical_data{'numeric'} = $self->numeric_clinical_data_file; } if ($self->categorical_clinical_data_file) { $clinical_data{'categ'} = $self->categorical_clinical_data_file; } if ($self->glm_clinical_data_file) { $clinical_data{'glm'} = $self->glm_clinical_data_file; } # create array of all sample names possibly included from clinical data and MAF my @all_sample_names; # names of all the samples, no matter if they are mutated or not my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n"; while( my $line = $sampleFh->getline ) { next if ( $line =~ m/^#/ ); chomp( $line ); my ( $sample ) = split( /\t/, $line ); push( @all_sample_names, $sample ); } $sampleFh->close; # loop through clinical data files and assemble survival data hash (vital_status and days_to_last_followup required); my %survival_data; my $vital_status_flag = 0; my $days_to_last_follow_flag = 0; for my $clin_file (keys %clinical_data) { #check filehandle my $clin_fh = new IO::File $clinical_data{$clin_file},"r"; unless ($clin_fh) { $self->error_message("Failed to open $clinical_data{$clin_file} for reading: $!"); return; } #initiate variables to hold column info my %phenotypes_to_print; my $vital_status_col = 0; my $days_to_last_follow_col = 0; #parse header and record column locations for needed data my $header = $clin_fh->getline; my @header_fields = split /\t/,$header; for (my $i = 1; $i <= $#header_fields; $i++) { #sample ID should be in first column of file my $field = $header_fields[$i]; if ($field =~ /vital_status|vitalstatus/i) { $vital_status_col = $i; $vital_status_flag++; } if ($field =~ /days_to_last_(follow_up|followup)|daystolastfollowup/i) { $days_to_last_follow_col = $i; $days_to_last_follow_flag++; } if (scalar grep { /^$field$/i } @phenotypes_to_include) { $phenotypes_to_print{$field} = $i; } } #read through clinical data file and store needed data in a hash while (my $line = $clin_fh->getline) { chomp $line; my @fields = split /\t/,$line; my $sample = $fields[0]; unless (scalar grep { m/^$sample$/ } @all_sample_names) { $self->status_message("Skipping sample $sample. (Sample is not in --bam-list)."); next; } if ($vital_status_col) { my $vital_status; if ($fields[$vital_status_col] =~ /^(0|living)$/i) { $vital_status = 0; } elsif ($fields[$vital_status_col] =~ /^(1|deceased)$/i) { $vital_status = 1; } else { $vital_status = "NA"; } $survival_data{$sample}{'vital_status'} = $vital_status; } if ($days_to_last_follow_col) { $survival_data{$sample}{'days'} = $fields[$days_to_last_follow_col]; } for my $pheno (keys %phenotypes_to_print) { $survival_data{$sample}{$pheno} = $fields[$phenotypes_to_print{$pheno}]; } } $clin_fh->close; # record phenotypes included from clinical data push @clinical_phenotypes_to_include, keys %phenotypes_to_print; } # check for necessary header fields unless ($vital_status_flag) { $self->error_message('Clinical data does not seem to contain a column labeled "vital_status".'); return; } unless ($days_to_last_follow_flag) { $self->error_message('Clnical data does not seem to contain a column labeled "days_to_last_followup".'); return; } # create temporary files for R command my $survival_data_file = Genome::Sys->create_temp_file_path(); my $mutation_matrix = Genome::Sys->create_temp_file_path(); # print survival data (temp file) my $surv_fh = new IO::File $survival_data_file,"w" or die "Couldn't open survival data filehandle."; print $surv_fh join("\t","Sample","Days_To_Last_Followup","Vital_Status"); if (@clinical_phenotypes_to_include) { print $surv_fh "\t" . join("\t",@clinical_phenotypes_to_include); } print $surv_fh "\n"; for my $sample (keys %survival_data) { unless (exists $survival_data{$sample}{'days'}) { $survival_data{$sample}{'days'} = "NA"; } unless (exists $survival_data{$sample}{'vital_status'}) { $survival_data{$sample}{'vital_status'} = "NA"; } print $surv_fh join("\t",$sample,$survival_data{$sample}{'days'},$survival_data{$sample}{'vital_status'}); for my $pheno (@clinical_phenotypes_to_include) { unless (exists $survival_data{$sample}{$pheno}) { $survival_data{$sample}{$pheno} = "NA"; } print $surv_fh "\t" . $survival_data{$sample}{$pheno}; } print $surv_fh "\n"; } $surv_fh->close; # find if any of the "phenotypes_to_include" are genes, and if so, limit the MAF mutation matrix to those genes my %clinical_pheno_to_include; @clinical_pheno_to_include{@clinical_phenotypes_to_include} = (); for my $item (@phenotypes_to_include) { push @mutated_genes_to_include,$item unless exists $clinical_pheno_to_include{$item}; } my $mutated_genes_to_include = \@mutated_genes_to_include; # create mutation matrix file if( $genetic_data_type =~ /^gene$/i ) { $self->create_sample_gene_matrix_gene( $mutation_matrix, $mutated_genes_to_include, @all_sample_names ); } elsif( $genetic_data_type =~ /^variant$/i ) { $self->create_sample_gene_matrix_variant( $mutation_matrix, $mutated_genes_to_include, @all_sample_names ); } else { $self->error_message( "Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter." ); return; } # check and prepare output directory my $output_dir = $self->output_dir . "/"; unless (-e $output_dir) { $self->status_message("Creating output directory: $output_dir..."); unless(mkdir $output_dir) { $self->error_message("Failed to create output directory: $!"); return; } } # set up R command my $R_cmd = "R --slave --args < " . __FILE__ . ".R " . join( " ", $survival_data_file, $mutation_matrix, $legend_placement, $output_dir ); print "R_cmd:\n$R_cmd\n"; #run R command WIFEXITED( system $R_cmd ) or croak "Couldn't run: $R_cmd ($?)"; return(1); }
sub create_sample_gene_matrix_gene {
my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_; #create hash of mutations from the MAF file my ( %mutations, %all_genes ); #parse the MAF file and fill up the mutation status hashes my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n"; while( my $line = $maf_fh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $gene, $mutation_class, $sample ) = @cols[0,8,15]; #check that the mutation class is valid if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" ); return; } #check to see if this gene is on the list (if there is a list at all) if( @{$mutated_genes_to_include} ) { next unless( scalar grep { m/^$gene$/ } @{$mutated_genes_to_include} ); } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) { print "Skipping $mutation_class mutation in gene $gene.\n"; next; } $all_genes{$gene}++; $mutations{$sample}{$gene}++; } $maf_fh->close; #sort @all_genes for consistency in header and loops my @all_genes = sort keys %all_genes; #write the input matrix for R code to a temp file my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!"; #print input matrix file header my $header = join( "\t", "Sample", @all_genes ); $matrix_fh->print( "$header\n" ); #print mutation relation input matrix for my $sample ( sort @all_sample_names ) { $matrix_fh->print( $sample ); for my $gene ( @all_genes ) { if( exists $mutations{$sample}{$gene} ) { $matrix_fh->print( "\t$mutations{$sample}{$gene}" ); } else { $matrix_fh->print( "\t0" ); } } $matrix_fh->print( "\n" ); } }
sub create_sample_gene_matrix_variant {
my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_; #create hash of mutations from the MAF file my ( %variants_hash, %all_variants ); #parse the MAF file and fill up the mutation status hashes my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n"; while( my $line = $maf_fh->getline ) { next if( $line =~ m/^(#|Hugo_Symbol)/ ); chomp $line; my @cols = split( /\t/, $line ); my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15]; #check that the mutation class is valid if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) { $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" ); return; } #check to see if this gene is on the list (if there is a list at all) if( @{$mutated_genes_to_include} ) { next unless (scalar grep { m/^$gene$/ } @{$mutated_genes_to_include}); } # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) || ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) { print "Skipping $mutation_class mutation in gene $gene.\n"; next; } my $var; my $variant_name; if( $ref eq $var1 ) { $var = $var2; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; } elsif( $ref eq $var2 ) { $var = $var1; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; } elsif( $ref ne $var1 && $ref ne $var2 ) { $var = $var1; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; $var = $var2; $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var; $variants_hash{$sample}{$variant_name}++; $all_variants{$variant_name}++; } } $maf_fh->close; #sort variants for consistency my @variant_names = sort keys %all_variants; #write the input matrix for R code to a file my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!"; #print input matrix file header my $header = join("\t","Sample",@variant_names); $matrix_fh->print("$header\n"); #print mutation relation input matrix for my $sample (sort @all_sample_names) { $matrix_fh->print($sample); for my $variant (@variant_names) { if (exists $variants_hash{$sample}{$variant}) { $matrix_fh->print("\t$variants_hash{$sample}{$variant}"); } else { $matrix_fh->print("\t0"); } } $matrix_fh->print("\n"); } }
1;
2020-11-06 | perl v5.30.3 |