1. 程式人生 > >從NCBI基因組資料中獲得cds,pep和geneID對應表

從NCBI基因組資料中獲得cds,pep和geneID對應表

在做基因組相關分析時,我們常常需要從基因組中提取cds,並翻譯成相應的pep序列。此指令碼,以NCBI資料庫中標準的基因組序列檔案和對應的gff檔案為輸入檔案,快速獲得cds序列,pep序列,RNA,Protein和gene的對應關係表等相關檔案。

A perl script which deals  with ncbi raw data,and from which get cds ,pep and gene,mRNA and protein ID list.

使用方法如下:

perl    $0    gff_filegenomes_fa_file        file_prefix

#! /usr/bin/perl -w

=head1 #######################################
	
=head1 name
	grep_cds_pep_from_ncbi_genomes_datas.pl

=head1 description
	deal with ncbi raw data,and from which get cds ,pep and gene,mRNA and protein ID list.

=head1 example
	perl  $0 ref_ConCri1.0_top_level.gff3.gz ccr_ref_ConCri1.0_chrUn.fa.gz  mole
	perl  $0 gff_file genomes_fa_file  file_prefix

=head1 author
	original from Xiangfeng Li,
[email protected]
##2014-4-19/21## =head1 ####################################### =cut use strict; die `pod2text $0` unless @ARGV==3; my ($gff,$fa,$prefix)[email protected]; ##deal gff file## $gff=~/gz$/ ? (open GFF,"gzip -cd $gff|"||die):(open GFF,$gff||die); my (%mrna,%cds,%pep); while(<GFF>){ chomp; next if(/^#/); my @p=split/\t/,$_; my @q=split/;/,$p[8]; my ($rna,$pep,$nt,$gene); my $chr=$p[0]; if($p[2] eq "mRNA"){ ($rna=$q[0])=~s/ID=//; ($nt=$q[1])=~s/Name=//; ($gene=$q[-3])=~s/gene=//; $mrna{$chr}{$rna}{strand}=$p[6]; $cds{$rna}=[$gene,$nt]; } if($p[2] eq "CDS"){ ($pep=$q[1])=~s/Name=//; ($rna=$q[2])=~s/Parent=//; push @{$mrna{$chr}{$rna}{nt}},[$p[3],$p[4]]; $pep{$rna}=$pep; } } close GFF; ##get id list## my %anno; my $id_gene_cds_pep=$prefix."_id_gene_cds_pep.lst"; open ID, ">",$id_gene_cds_pep||die; foreach my $i(sort keys %pep){ if($cds{$i}){ my $out=join "\t",$i,$cds{$i}[0],$cds{$i}[1],$pep{$i}; print ID $out,"\n"; ($anno{$i}=$out)=~s/\t/\|/g; } } warn "create file:$id_gene_cds_pep\n"; close ID; ##deal fa file## my %max; $fa=~/gz$/ ? (open FA,"gzip -cd $fa|"||die):(open FA,$fa||die); my $raw_cds=$prefix."_raw_cds"; open CDS1,">$raw_cds"||die; my ($start,$end); $/=">";<FA>;$/="\n"; while(<FA>){ my $name=$1 if(/(\S+)/); my $info=(split/\|/,$name)[-1]; $/=">"; my $seq=<FA>; $/="\n"; $seq=~s/>|\n+//g; my $scaf=$mrna{$info}; foreach my $k(sort keys %$scaf){ next if(! exists $scaf->{$k}{nt}); my @
[email protected]
{$scaf->{$k}{nt}}; my $strand=$$scaf{$k}{strand}; my $get; @p=sort{$a->[0]<=>$b->[0]}@p; my $loc1=$p[0][0]; my $loc2=$p[-1][1]; my ($get_len,$add,$out,$gene); if(exists $anno{$k}){ $add=$anno{$k}; $gene=(split/\|/,$add)[1]; }else{next;} foreach(@p){ ($start,$end)
[email protected]
$_[0,1]; $get.=uc(substr($seq,$start-1,$end-$start+1)); } if($strand eq "+"){ $get_len=length$get; $get=~s/([A-Z]{50})/$1\n/g; chop($get) unless($get_len%50); $out=">$add LOC=$info :$loc1:$loc2:+ length=$get_len\n$get\n"; push @{$max{$gene}},[$get_len,$out]; print CDS1 $out; } if($strand eq "-"){ $get=&reverse_complement($get); $get_len=length$get; $get=~s/([A-Z]{50})/$1\n/g; chop($get) unless($get_len%50); $out=">$add LOC=$info :$loc1:$loc2:- length=$get_len\n$get\n"; push @{$max{$gene}},[$get_len,$out]; print CDS1 $out; } } } warn "create file:$raw_cds\n"; close FA; close CDS1; ##get max transcript## my $filter_cds=$prefix."_filter_cds"; open CDS2,">$filter_cds"||die; my @a; foreach my $j(keys %max){ my @[email protected]{$max{$j}}; @trans=sort{$a->[0] cmp $b->[0]}@trans; push @a,$trans[-1][1]; } my @a_new; foreach(@a){ my $r=$1 if(/^>rna(\d+)/); push @a_new,[$r,$_]; } my @cds_sort=map{$_->[1]} sort{$a->[0] <=> $b->[0]}@a_new; print CDS2 $_ [email protected]_sort; close CDS2; warn "create file:$filter_cds\n"; ##get raw pep sequences## my $raw_pep=$prefix."_raw_pep"; open PEP1,">",$raw_pep||die; my @raw_pep=&cds2pep($raw_cds); print PEP1 $_ [email protected]_pep; close PEP1; warn "create file:$raw_pep\n"; ##get filter pep sequences## my $filter_pep=$prefix."_filter_pep"; open PEP2,">$filter_pep"||die; my @filter_pep=&cds2pep($filter_cds); print PEP2 $_ [email protected]_pep; close PEP2; warn "create file:$filter_pep\n"; ##add label for cds and pep of filter## my $label=uc($prefix); open IN1,$filter_cds||die; my $filter_cds_label=$prefix."_filter_cds_label"; open OUT1,">",$filter_cds_label||die; while(<IN1>){ chomp; if(/^>/){ my @a=split/\|/,$_,2; my $name=$a[0]."_$label"; print OUT1"$name |$a[1]\n"; }else{print OUT1 "$_\n";} } close IN1; close OUT1; warn "create file:$filter_cds_label\n"; open IN2,$filter_pep||die; my $filter_pep_label=$prefix."_filter_pep_label"; open OUT2,">",$filter_pep_label||die; while(<IN2>){ chomp; if(/^>/){ my @a=split/\|/,$_,2; my $name=$a[0]."_$label"; print OUT2 "$name |$a[1]\n"; }else{print OUT2 "$_\n";} } close IN2; close OUT2; warn "create file:$filter_pep_label\n"; ##timing## my $time=times; my $time_out=sprintf "%.2f",$time/60; print "##########\nElapsed Time :$time_out mins\n##########\n"; ##subroutine## sub reverse_complement{ my ($seq)=shift; $seq=reverse$seq; $seq=~tr/AaGgCcTt/TtCcGgAa/; return $seq; } ##subroutine## sub cds2pep{ my $file=shift; my %code = ( "standard" => { 'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A', # Alanine 'TGC' => 'C', 'TGT' => 'C', # Cysteine 'GAC' => 'D', 'GAT' => 'D', # Aspartic Aci 'GAA' => 'E', 'GAG' => 'E', # Glutamic Aci 'TTC' => 'F', 'TTT' => 'F', # Phenylalanin 'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G', # Glycine 'CAC' => 'H', 'CAT' => 'H', # Histidine 'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I', # Isoleucine 'AAA' => 'K', 'AAG' => 'K', # Lysine 'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'TTA' => 'L', 'TTG' => 'L', # Leucine 'ATG' => 'M', # Methionine 'AAC' => 'N', 'AAT' => 'N', # Asparagine 'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P', # Proline 'CAA' => 'Q', 'CAG' => 'Q', # Glutamine 'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'AGA' => 'R', 'AGG' => 'R', # Arginine 'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'AGC' => 'S', 'AGT' => 'S', # Serine 'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T', # Threonine 'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V', # Valine 'TGG' => 'W', # Tryptophan 'TAC' => 'Y', 'TAT' => 'Y', # Tyrosine 'TAA' => 'U', 'TAG' => 'U', 'TGA' => 'U' # Stop } ## more translate table could be added here in future ## more translate table could be added here in future ## more translate table could be added here in future ); open IN,$file||die; $/=">";<IN>;$/="\n"; my @results_set; while(<IN>){ my $info=$_; my @a=split/\s+/,$info; $/=">";my $seq=<IN>;$/="\n"; $seq=~s/\n|>//g; my $len=length$seq; my $info_out=join " ",@a[0..($#a-1)]; my ($pep_out,$triplet); for(my $i=0;$i<$len;$i+=3){ $triplet=substr($seq,$i,3); next if(length$triplet!=3); if(exists $code{standard}{$triplet}){ $pep_out.=$code{standard}{$triplet}; }else{$pep_out.="X"} } $pep_out=~s/U$// if($pep_out=~/U$/); my $pep_len=length$pep_out; $pep_out=~s/([A-Z]{50})/$1\n/g; chop($pep_out) unless($pep_len%50); my $results= ">$info_out length=$pep_len\n$pep_out\n"; push @results_set,$results; } return @results_set; } __END__