samtools mpileup原始结果文件处理,非bcf/vcf格式

samtools mpileup处理的原始结果向来比较麻烦,写了一个小程序只生成碱基reads,但不进行任何过滤:

#!/usr/bin/env perl 
use strict;
use warnings;
use Getopt::Long;

my %opt;
GetOptions(\%opt, "i=s", "o=s", "sd=s", "ad=s", "aq=s");

my $usage = <<"USAGE";

	Usage: perl $0 -i <pileup>  [options]
	CreationTime: 
	ModifyTime: 
	Note: 

	Options:
	-i			input file

	eg: perl $0 -i file.pileup > result

USAGE

die $usage if(!$opt{i});

open IN, ($opt{i} =~ /\.gz$/) ? "gzip -dc $opt{i} |" : $opt{i} or die $!;
my (@tmp, @qual, @base, %index);
while(<IN>)
{
	chomp;
	@tmp = split /\t/;
	next if($tmp[2] eq "N");

	while($tmp[4] =~ /[\+-](\d+)[ATCGNatcgn]+/)
	{
		$tmp[4] =~ s/[\+-]\d+[ATCGNatcgn]{$1}//;
	}
	$tmp[4] =~ s/\^\S//g; #\<\=\>\-\!\?\/\'\"\)\(\]\[\*\\\.\+,;:@&#%0-9A-Z
	$tmp[4] =~ s/[\^\$]//g;

	@base = split //, $tmp[4];
	
	my %hash = ('r'=>0, 'A'=>0, 'T'=>0, 'C'=>0, 'G'=>0);
	foreach my $b(@base)
	{
		if($b =~ /[\.,]/)
		{
			$hash{'r'} ++;
			next;
		}
		if($b=~ /[Aa]/)
		{
			$hash{'A'} ++;
			next;
		}
		if($b =~ /[Tt]/)
		{
			$hash{'T'} ++;
			next;
		}
		if($b =~ /[Cc]/)
		{
			$hash{'C'} ++;
			next;
		}
		if($b =~ /[Gg]/)
		{
			$hash{'G'} ++;
			next;
		}
	}
	my ($m, $fb, $fbn, $sb, $sbn);
	foreach my $i(sort {$hash{$b} <=> $hash{$a}} keys %hash)
	{
		$m ++;
		if($m == 1 and $i eq "r")
		{
			$fb = $tmp[2];
			$fbn = $hash{$i};
			next;
		}elsif($m == 1){
			$fb = $i;
			$fbn = $hash{$i};
			next;
		}
		if($m == 2 and $i eq 'r')
		{
			$sb = $tmp[2];
			$sbn = $hash{$i};
			last;
		}elsif($m == 2){
			$sb = $i;
			$sbn = $hash{$i};
			last;
		}
	}
	print join "\t", @tmp[0..2], $fb, $fbn, $sb, $sbn, "\n";
}
close IN;

sub showtime
{
	my ($x) = @_;
	my $t = localtime;
	print STDERR "[$t] $x\n";
}
尤其是没有过滤质量值,所以得到的reads肯定是不准确的,另外一个过滤版本由于某些经验上的判定原因暂不发布。
阅读更多

更多精彩内容