#!/usr/bin/perl # ppipe2dna.pl use strict; use Bio::DB::Fasta; my($genome,$realtr,$pseudoaa)= @ARGV; die "usage: ppipe2dna genome.fa realgene.tr pseudopipealign.fa convert pseudopipe amino alignment to realgene x pseudogene dna align " unless (-f $pseudoaa and -f $realtr and -f $genome); my $gdb= Bio::DB::Fasta->new($genome); my $rdb= Bio::DB::Fasta->new($realtr); open(AA,$pseudoaa); my($gene, $pgene, $gloc, $aastart,$aaend, @aa, $exons); while () { if(/^>(\S+)(.*)/){ my($aid,$ahead)=($1,$2); print_tr($gene, $pgene, $gloc, $exons, $aastart,$aaend, \@aa) if($gene and $gloc); @aa=(); my @hd = split " ", $aid ." " .$ahead; # ..........^ really want tab split, some flds have space; no tabs in .fa?? # o19?: (273415..273748 273812..274320) << need this exon location ($exons)= $ahead =~ m/\(([^\)]+)\)/; my ($scaf)= $aid =~ m/chr(\d+)/; $gloc= "scaffold_${scaf}:$hd[2]-$hd[3]"; $gene= $hd[5]; $aastart= $hd[6]; $aaend= $hd[6]; $pgene= $aid; } elsif(/^\w/) { chomp; push(@aa,$_); # should be only 2: query prot, dna-tr } } close(AA); print_tr($gene, $pgene,$gloc, $exons, $aastart,$aaend,\@aa) if($gene and $gloc); sub print_tr { my( $realgene, $pseudogene, $glocation, $exons, $aastart, $aaend, $palign)= @_; my $realaa = $palign->[0]; my $pseudoaa = $palign->[1]; # dna-tr align my $genetr= $rdb->seq($realgene); my @exons= split " ",$exons; my @pseudoaa= split " ", $pseudoaa; my @realaa= split " ", $realaa; my $exonfa=""; # my @exonfa=(); my ($ref)= $glocation =~ m/^(\w+)/; foreach my $ix (0..$#exons) { my $exloc = $exons[$ix]; my $psiaa = $pseudoaa[$ix]; my $realaa= $realaa[$ix]; my $exon = $gdb->seq("$ref:$exloc"); my $ina= 0; my $exna=""; if($ix==0 and $aastart>1) { $exna= ("-") x (3 * $aastart-1); } ## need to find codon phase for ix>0 ## this is only for realtr ajustment at exon end/start ? if($ix>0) { my $nx= length($exonfa); my $cb= substr($genetr,$nx,3); if($exon !~ m/^$cb/) { my $eb= substr($exon,0,3); my $ib= index($genetr,$eb,$nx); my $off= $ib - $nx; # off by 1? if($off>0 and $off < 6) { $exna.= ('-') x $off; } } } my $naa= length($psiaa); for (my $iaa=0; $iaa<$naa; $iaa++) { my $paa = substr($psiaa,$iaa,1); my $raa = substr($realaa,$iaa,1); if($raa eq '-' and $paa eq '\\') { $ina--; $exna =~ s/..$//; } #? means last codon shifted by 1 base? elsif($raa eq '-' and $paa eq '/') { $ina++; $exna .="--"; } # forward shift elsif($raa eq '-') { $ina+=3; } elsif($paa eq '-') { $exna.="---"; } elsif($paa eq '\\') { $ina--; } # $exna.="-"; #? backspace elsif($paa eq '/') { $ina++;} # $exna.="--"; else { $exna.= substr($exon,$ina,3); $ina+=3; } } # push(@exonfa, $exna); # dont need both, need $exonfa $exonfa.= $exna; } print "gene: $gene\n"; print "pseudogene: $pseudogene\n"; print "gloc: $glocation ($exons)\n"; print "real-aa : $realaa\n"; print "pseudo-aa: $pseudoaa\n"; print ">$realgene\n$genetr\n"; print ">$pseudogene\n",$exonfa,"\n"; #join(",",@exonfa) } __END__