Scroll to navigation

Genome::Model::Tools::Music::Survival(3pm) User Contributed Perl Documentation Genome::Model::Tools::Music::Survival(3pm)

  • Headers are required.
  • The first column of each clinical data file must contain sample IDs which match those in both the --bam-list and the MAF variant list (in the MAF, this is the Tumor_Sample_Barcode column, specifically).
  • In at least one of the clinical data files input, columns with headers "vital_status" and "days_to_last_followup" (case insensitive) must exist. "vital_status" must be delineated by 1's and 0's, where 0 denotes 'living', and 1 denotes 'deceased'.

Note that all input files must be tab-separated.

HELP }

sub _additional_help_sections { return ( "ARGUMENTS", <<EOS

--bam-list
Provide a file containing sample names and normal/tumor BAM locations for each. Use the tab- delimited format [sample_name normal_bam tumor_bam] per line. This tool only needs sample_name, so all other columns can be skipped. The sample_name must be the same as the tumor sample names used in the MAF file (16th column, with the header Tumor_Sample_Barcode).

EOS ); }

sub _doc_authors { return <<EOS Nathan D. Dees, Ph.D. Qunyuan Zhang, Ph.D. EOS }

sub execute {

    # parse input arguments
    my $self = shift;
    my $bam_list = $self->bam_list;
    my $genetic_data_type = $self->genetic_data_type;
    my $legend_placement = $self->legend_placement;

    # handle phenotype inclusions
    my @phenotypes_to_include;
    my @clinical_phenotypes_to_include;
    my @mutated_genes_to_include;
    if ($self->phenotypes_to_include) { @phenotypes_to_include = split /,/,$self->phenotypes_to_include; }

    # check genetic data type
    unless ($genetic_data_type =~ /^gene|variant$/i) {
        $self->error_message("Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter.");
        return;
    }

    # load clinical data and analysis types
    my %clinical_data;
    if ($self->numeric_clinical_data_file) {
        $clinical_data{'numeric'} = $self->numeric_clinical_data_file;
    }
    if ($self->categorical_clinical_data_file) {
        $clinical_data{'categ'} = $self->categorical_clinical_data_file;
    }
    if ($self->glm_clinical_data_file) {
        $clinical_data{'glm'} = $self->glm_clinical_data_file;
    }

    # create array of all sample names possibly included from clinical data and MAF
    my @all_sample_names; # names of all the samples, no matter if they are mutated or not
    my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n";
    while( my $line = $sampleFh->getline )
    {
        next if ( $line =~ m/^#/ );
        chomp( $line );
        my ( $sample ) = split( /\t/, $line );
        push( @all_sample_names, $sample );
    }
    $sampleFh->close;

    # loop through clinical data files and assemble survival data hash (vital_status and days_to_last_followup required);
    my %survival_data;
    my $vital_status_flag = 0;
    my $days_to_last_follow_flag = 0;

    for my $clin_file (keys %clinical_data) {

        #check filehandle
        my $clin_fh = new IO::File $clinical_data{$clin_file},"r";
        unless ($clin_fh) {
            $self->error_message("Failed to open $clinical_data{$clin_file} for reading: $!");
            return;
        }

        #initiate variables to hold column info
        my %phenotypes_to_print;
        my $vital_status_col = 0;
        my $days_to_last_follow_col = 0;

        #parse header and record column locations for needed data
        my $header = $clin_fh->getline;
        my @header_fields = split /\t/,$header;
        for (my $i = 1; $i <= $#header_fields; $i++) { #sample ID should be in first column of file
            my $field = $header_fields[$i];
            if ($field =~ /vital_status|vitalstatus/i) { $vital_status_col = $i; $vital_status_flag++; }
            if ($field =~ /days_to_last_(follow_up|followup)|daystolastfollowup/i) { $days_to_last_follow_col = $i; $days_to_last_follow_flag++; }
            if (scalar grep { /^$field$/i } @phenotypes_to_include) { $phenotypes_to_print{$field} = $i; }
        }

        #read through clinical data file and store needed data in a hash
        while (my $line = $clin_fh->getline) {
            chomp $line;
            my @fields = split /\t/,$line;
            my $sample = $fields[0];
            unless (scalar grep { m/^$sample$/ } @all_sample_names) {
                $self->status_message("Skipping sample $sample. (Sample is not in --bam-list).");
                next;
            }
            if ($vital_status_col) {
                my $vital_status;
                if ($fields[$vital_status_col] =~ /^(0|living)$/i) { $vital_status = 0; }
                elsif ($fields[$vital_status_col] =~ /^(1|deceased)$/i) { $vital_status = 1; }
                else { $vital_status = "NA"; }
                $survival_data{$sample}{'vital_status'} = $vital_status;
            }
            if ($days_to_last_follow_col) { $survival_data{$sample}{'days'} = $fields[$days_to_last_follow_col]; }
            for my $pheno (keys %phenotypes_to_print) { $survival_data{$sample}{$pheno} = $fields[$phenotypes_to_print{$pheno}]; }
        }
        $clin_fh->close;

        # record phenotypes included from clinical data
        push @clinical_phenotypes_to_include, keys %phenotypes_to_print;

    }

    # check for necessary header fields
    unless ($vital_status_flag) {
        $self->error_message('Clinical data does not seem to contain a column labeled "vital_status".');
        return;
    }
    unless ($days_to_last_follow_flag) {
        $self->error_message('Clnical data does not seem to contain a column labeled "days_to_last_followup".');
        return;
    }

    # create temporary files for R command
    my $survival_data_file = Genome::Sys->create_temp_file_path();
    my $mutation_matrix = Genome::Sys->create_temp_file_path();

    # print survival data (temp file)
    my $surv_fh = new IO::File $survival_data_file,"w" or die "Couldn't open survival data filehandle.";
    print $surv_fh join("\t","Sample","Days_To_Last_Followup","Vital_Status");
    if (@clinical_phenotypes_to_include) { print $surv_fh "\t" . join("\t",@clinical_phenotypes_to_include); }
    print $surv_fh "\n";
    for my $sample (keys %survival_data) {
        unless (exists $survival_data{$sample}{'days'}) { $survival_data{$sample}{'days'} = "NA"; }
        unless (exists $survival_data{$sample}{'vital_status'}) { $survival_data{$sample}{'vital_status'} = "NA"; }
        print $surv_fh join("\t",$sample,$survival_data{$sample}{'days'},$survival_data{$sample}{'vital_status'});
        for my $pheno (@clinical_phenotypes_to_include) {
            unless (exists $survival_data{$sample}{$pheno}) { $survival_data{$sample}{$pheno} = "NA"; }
            print $surv_fh "\t" . $survival_data{$sample}{$pheno};
        }
        print $surv_fh "\n";
    }
    $surv_fh->close;

    # find if any of the "phenotypes_to_include" are genes, and if so, limit the MAF mutation matrix to those genes
    my %clinical_pheno_to_include;
    @clinical_pheno_to_include{@clinical_phenotypes_to_include} = ();
    for my $item (@phenotypes_to_include) {
        push @mutated_genes_to_include,$item unless exists $clinical_pheno_to_include{$item};
    }
    my $mutated_genes_to_include = \@mutated_genes_to_include;

    # create mutation matrix file
    if( $genetic_data_type =~ /^gene$/i ) {
        $self->create_sample_gene_matrix_gene( $mutation_matrix, $mutated_genes_to_include, @all_sample_names );
    }
    elsif( $genetic_data_type =~ /^variant$/i ) {
        $self->create_sample_gene_matrix_variant( $mutation_matrix, $mutated_genes_to_include, @all_sample_names );
    }
    else {
        $self->error_message( "Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter." );
        return;
    }

    # check and prepare output directory
    my $output_dir = $self->output_dir . "/";
    unless (-e $output_dir) {
        $self->status_message("Creating output directory: $output_dir...");
        unless(mkdir $output_dir) {
            $self->error_message("Failed to create output directory: $!");
            return;
        }
    }

    # set up R command
    my $R_cmd = "R --slave --args < " . __FILE__ . ".R " . join( " ", $survival_data_file, $mutation_matrix, $legend_placement, $output_dir );
    print "R_cmd:\n$R_cmd\n";

    #run R command
    WIFEXITED( system $R_cmd ) or croak "Couldn't run: $R_cmd ($?)";

    return(1);
}

sub create_sample_gene_matrix_gene {

    my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_;

    #create hash of mutations from the MAF file
    my ( %mutations, %all_genes );

    #parse the MAF file and fill up the mutation status hashes
    my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
    while( my $line = $maf_fh->getline ) {
        next if( $line =~ m/^(#|Hugo_Symbol)/ );
        chomp $line;
        my @cols = split( /\t/, $line );
        my ( $gene, $mutation_class, $sample ) = @cols[0,8,15];

        #check that the mutation class is valid
        if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
            $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
            return;
        }

        #check to see if this gene is on the list (if there is a list at all)
        if( @{$mutated_genes_to_include} ) {
            next unless( scalar grep { m/^$gene$/ } @{$mutated_genes_to_include} );
        }

        # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
        if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
           ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
            print "Skipping $mutation_class mutation in gene $gene.\n";
            next;
        }

        $all_genes{$gene}++;
        $mutations{$sample}{$gene}++;
    }
    $maf_fh->close;

    #sort @all_genes for consistency in header and loops
    my @all_genes = sort keys %all_genes;

    #write the input matrix for R code to a temp file
    my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!";

    #print input matrix file header
    my $header = join( "\t", "Sample", @all_genes );
    $matrix_fh->print( "$header\n" );

    #print mutation relation input matrix
    for my $sample ( sort @all_sample_names ) {
        $matrix_fh->print( $sample );
        for my $gene ( @all_genes ) {
            if( exists $mutations{$sample}{$gene} ) {
                $matrix_fh->print( "\t$mutations{$sample}{$gene}" );
            }
            else {
                $matrix_fh->print( "\t0" );
            }
        }
        $matrix_fh->print( "\n" );
    }
}

sub create_sample_gene_matrix_variant {

    my ( $self, $mutation_matrix, $mutated_genes_to_include, @all_sample_names ) = @_;

    #create hash of mutations from the MAF file
    my ( %variants_hash, %all_variants );

    #parse the MAF file and fill up the mutation status hashes
    my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
    while( my $line = $maf_fh->getline ) {
        next if( $line =~ m/^(#|Hugo_Symbol)/ );
        chomp $line;
        my @cols = split( /\t/, $line );
        my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15];

        #check that the mutation class is valid
        if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
            $self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
            return;
        }

        #check to see if this gene is on the list (if there is a list at all)
        if( @{$mutated_genes_to_include} ) {
            next unless (scalar grep { m/^$gene$/ } @{$mutated_genes_to_include});
        }

        # If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
        if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
           ( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
            print "Skipping $mutation_class mutation in gene $gene.\n";
            next;
        }

        my $var;
        my $variant_name;
        if( $ref eq $var1 ) {
            $var = $var2;
            $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
            $variants_hash{$sample}{$variant_name}++;
            $all_variants{$variant_name}++;
        }
        elsif( $ref eq $var2 ) {
            $var = $var1;
            $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
            $variants_hash{$sample}{$variant_name}++;
            $all_variants{$variant_name}++;
        }
        elsif( $ref ne $var1 && $ref ne $var2 ) {
            $var = $var1;
            $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
            $variants_hash{$sample}{$variant_name}++;
            $all_variants{$variant_name}++;
            $var = $var2;
            $variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
            $variants_hash{$sample}{$variant_name}++;
            $all_variants{$variant_name}++;
        }
    }
    $maf_fh->close;

    #sort variants for consistency
    my @variant_names = sort keys %all_variants;

    #write the input matrix for R code to a file
    my $matrix_fh = new IO::File $mutation_matrix,"w" or die "Failed to create matrix file $mutation_matrix!: $!";

    #print input matrix file header
    my $header = join("\t","Sample",@variant_names);
    $matrix_fh->print("$header\n");

    #print mutation relation input matrix
    for my $sample (sort @all_sample_names) {
        $matrix_fh->print($sample);
        for my $variant (@variant_names) {
            if (exists $variants_hash{$sample}{$variant}) {
                $matrix_fh->print("\t$variants_hash{$sample}{$variant}");
            }
            else {
                $matrix_fh->print("\t0");
            }
        }
        $matrix_fh->print("\n");
    }
}

1;

2018-07-05 perl v5.26.2