table of contents
| Genome::Model::Tools::Music::ClinicalCorrelation(3pm) | User Contributed Perl Documentation | Genome::Model::Tools::Music::ClinicalCorrelation(3pm) |
- •
- Headers are required
- •
- Each file must include at least 1 sample_id column and 1 attribute column, with the format being [sample_id clinical_data_attribute_1 clinical_data_attribute_2 ...]
- •
- The sample ID must match the sample ID listed in the MAF under "Tumor_Sample_Barcode" for relating the mutations of this sample.
- •
- Columns must be ordered as such:
- •
- [ analysis_type clinical_data_trait_name variant/gene_name covariates memo ]
- •
- The 'analysis_type' column must contain either "Q", indicating a quantative trait, or "B", indicating a binary trait will be examined.
- •
- The 'clinical_data_trait_name' is the name of a clinical data trait defined by being a header in the --glm-clinical-data-file.
- •
- The 'variant/gene_name' can either be the name of one or more columns from the --glm-clinical-data-file, or the name of one or more mutated gene names from the MAF, separated by "|". If this column is left blank, or instead contains "NA", then each column from either the variant mutation matrix (--use-maf-in-glm) or alternatively the --glm-clinical-data-file is used consecutively as the variant column in independent analyses.
- •
- 'covariates' are the names of one or more columns from the --glm-clinical-data-file, separated by "+".
- •
- 'memo' is any note deemed useful to the user. It will be printed in the output data file for reference.
return (
"ARGUMENTS", <<EOS
- --bam-list
- Provide a file containing sample names and normal/tumor BAM locations for each. Use the tab- delimited format [sample_name normal_bam tumor_bam] per line. This tool only needs sample_name, so all other columns can be skipped. The sample_name must be the same as the tumor sample names used in the MAF file (16th column, with the header Tumor_Sample_Barcode).
); } sub _doc_authors {
return <<EOS
Nathan D. Dees, Ph.D.
Qunyuan Zhang, Ph.D.
William Schierding, M.S. EOS } sub execute {
# parse input arguments
my $self = shift;
my $bam_list = $self->bam_list;
my $output_file = $self->output_file;
my $genetic_data_type = $self->genetic_data_type;
# check genetic data type
unless( $genetic_data_type =~ /^gene|variant$/i ) {
$self->error_message("Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter.");
return;
}
# load clinical data and analysis types
my %clinical_data;
if( $self->numeric_clinical_data_file ) {
$clinical_data{'numeric'} = $self->numeric_clinical_data_file;
}
if( $self->categorical_clinical_data_file ) {
$clinical_data{'categ'} = $self->categorical_clinical_data_file;
}
if( $self->glm_clinical_data_file ) {
$clinical_data{'glm'} = $self->glm_clinical_data_file;
}
my $glm_model = $self->glm_model_file;
# declarations
my @all_sample_names; # names of all the samples, no matter if it's mutated or not
# parse out the sample names from the bam-list which should match the names in the MAF file
my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n";
while( my $line = $sampleFh->getline ) {
next if ( $line =~ m/^#/ );
chomp( $line );
my ( $sample ) = split( /\t/, $line );
push( @all_sample_names, $sample );
}
$sampleFh->close;
# loop through clinical data files
for my $datatype ( keys %clinical_data ) {
my $test_method;
my $full_output_filename;
if( $datatype =~ /numeric/i ) {
$full_output_filename = $output_file . ".numeric.csv";
$test_method = $self->numerical_data_test_method;
}
if( $datatype =~ /categ/i ) {
$full_output_filename = $output_file . ".categorical.csv";
$test_method = "fisher";
}
if( $datatype =~ /glm/i ) {
$full_output_filename = $output_file . ".glm.csv";
$test_method = "glm";
}
#read through clinical data file to see which samples are represented and create input matrix for R
my %samples;
my $matrix_file;
my $samples = \%samples;
my $clin_fh = new IO::File $clinical_data{$datatype},"r";
unless( $clin_fh ) {
die "failed to open $clinical_data{$datatype} for reading: $!";
}
my $header = $clin_fh->getline;
while( my $line = $clin_fh->getline ) {
chomp $line;
my ( $sample ) = split( /\t/, $line );
$samples{$sample}++;
}
#create correlation matrix unless it's glm analysis without using a maf file
unless(( $datatype =~ /glm/i && !$self->use_maf_in_glm ) || $self->input_clinical_correlation_matrix_file ) {
if( $genetic_data_type =~ /^gene$/i ) {
$matrix_file = $self->create_sample_gene_matrix_gene( $samples, $clinical_data{$datatype}, @all_sample_names );
}
elsif( $genetic_data_type =~ /^variant$/i ) {
$matrix_file = $self->create_sample_gene_matrix_variant( $samples, $clinical_data{$datatype}, @all_sample_names );
}
else {
$self->error_message( "Please enter either \"gene\" or \"variant\" for the --genetic-data-type parameter." );
return;
}
}
if( $self->input_clinical_correlation_matrix_file ) {
$matrix_file = $self->input_clinical_correlation_matrix_file;
}
unless( defined $matrix_file ) { $matrix_file = "'*'"; }
#set up R command
my $R_cmd = "R --slave --args < " . __FILE__ . ".R $test_method ";
if( $datatype =~ /glm/i ) {
$R_cmd .= "$glm_model $clinical_data{$datatype} $matrix_file $full_output_filename";
}
else {
$R_cmd .= "$clinical_data{$datatype} $matrix_file $full_output_filename";
}
#run R command
print "R_cmd:\n$R_cmd\n";
WIFEXITED( system $R_cmd ) or croak "Couldn't run: $R_cmd ($?)";
}
return( 1 );
}
sub create_sample_gene_matrix_gene {
my ( $self, $samples, $clinical_data_file, @all_sample_names ) = @_;
my $output_matrix = $self->clinical_correlation_matrix_file;
#create a hash of mutations from the MAF file
my ( %mutations, %all_genes, @all_genes );
#parse the MAF file and fill up the mutation status hashes
my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
while( my $line = $maf_fh->getline ) {
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp $line;
my @cols = split( /\t/, $line );
my ( $gene, $mutation_class, $sample ) = @cols[0,8,15];
#check that the mutation class is valid
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
$self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
return;
}
#check if sample exists in clinical data
unless( defined $samples->{$sample} ) {
warn "Sample Name: $sample from MAF file does not exist in Clinical Data File";
next;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
print "Skipping $mutation_class mutation in gene $gene.\n";
next;
}
$all_genes{$gene}++;
$mutations{$sample}{$gene}++;
}
$maf_fh->close;
#sort @all_genes for consistency
@all_genes = sort keys %all_genes;
#write the input matrix for R code to a file
my $matrix_fh;
unless (defined $output_matrix) {
$output_matrix = Genome::Sys->create_temp_file_path();
}
$matrix_fh = new IO::File $output_matrix,"w";
unless ($matrix_fh) {
die "Failed to create matrix file $output_matrix!: $!";
}
#print input matrix file header
my $header = join("\t","Sample",@all_genes);
$matrix_fh->print("$header\n");
#print mutation relation input matrix
for my $sample (sort @all_sample_names) {
$matrix_fh->print($sample);
for my $gene (@all_genes) {
if (exists $mutations{$sample}{$gene}) {
$matrix_fh->print("\t$mutations{$sample}{$gene}");
}
else {
$matrix_fh->print("\t0");
}
}
$matrix_fh->print("\n");
}
return $output_matrix;
}
sub create_sample_gene_matrix_variant {
my ( $self, $samples, $clinical_data_file, @all_sample_names ) = @_;
my $output_matrix = $self->clinical_correlation_matrix_file;
#create hash of mutations from the MAF file
my ( %variants_hash, %all_variants );
#parse the MAF file and fill up the mutation status hashes
my $maf_fh = IO::File->new( $self->maf_file ) or die "Couldn't open MAF file!\n";
while( my $line = $maf_fh->getline ) {
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp $line;
my @cols = split( /\t/, $line );
my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref, $var1, $var2, $sample ) = @cols[0,4..6,8..12,15];
#check that the mutation class is valid
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ ) {
$self->error_message( "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\nPlease use TCGA MAF v2.3.\n" );
return;
}
unless( exists $samples->{$sample} ) {
warn "Sample Name: $sample from MAF file does not exist in Clinical Data File";
next;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $self->skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $self->skip_silent && $mutation_class =~ m/^Silent$/ )) {
print "Skipping $mutation_class mutation in gene $gene.\n";
next;
}
my $var;
my $variant_name;
if( $ref eq $var1 ) {
$var = $var2;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
}
elsif( $ref eq $var2 ) {
$var = $var1;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
}
elsif( $ref ne $var1 && $ref ne $var2 ) {
$var = $var1;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
$var = $var2;
$variant_name = $gene."_".$chr."_".$start."_".$stop."_".$ref."_".$var;
$variants_hash{$sample}{$variant_name}++;
$all_variants{$variant_name}++;
}
}
$maf_fh->close;
#sort variants for consistency
my @variant_names = sort keys %all_variants;
#write the input matrix for R code to a file
my $matrix_fh;
unless( defined $output_matrix ) {
$output_matrix = Genome::Sys->create_temp_file_path();
}
$matrix_fh = new IO::File $output_matrix,"w";
unless( $matrix_fh ) {
die "Failed to create matrix file $output_matrix!: $!";
}
#print input matrix file header
my $header = join( "\t", "Sample", @variant_names );
$matrix_fh->print("$header\n");
#print mutation relation input matrix
for my $sample ( sort @all_sample_names ) {
$matrix_fh->print( $sample );
for my $variant ( @variant_names ) {
if( exists $variants_hash{$sample}{$variant} ) {
$matrix_fh->print("\t$variants_hash{$sample}{$variant}");
}
else {
$matrix_fh->print("\t0");
}
}
$matrix_fh->print("\n");
}
return $output_matrix;
}
1;| 2013-05-14 | perl v5.14.2 |