samtools mpileup处理的原始结果向来比较麻烦,写了一个小程序只生成碱基reads,但不进行任何过滤:
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
my %opt;
GetOptions(\%opt, "i=s", "o=s", "sd=s", "ad=s", "aq=s");
my $usage = <<"USAGE";
Usage: perl $0 -i <pileup> [options]
CreationTime:
ModifyTime:
Note:
Options:
-i input file
eg: perl $0 -i file.pileup > result
USAGE
die $usage if(!$opt{i});
open IN, ($opt{i} =~ /\.gz$/) ? "gzip -dc $opt{i} |" : $opt{i} or die $!;
my (@tmp, @qual, @base, %index);
while(<IN>)
{
chomp;
@tmp = split /\t/;
next if($tmp[2] eq "N");
while($tmp[4] =~ /[\+-](\d+)[ATCGNatcgn]+/)
{
$tmp[4] =~ s/[\+-]\d+[ATCGNatcgn]{$1}//;
}
$tmp[4] =~ s/\^\S//g; #\<\=\>\-\!\?\/\'\"\)\(\]\[\*\\\.\+,;:@&#%0-9A-Z
$tmp[4] =~ s/[\^\$]//g;
@base = split //, $tmp[4];
my %hash = ('r'=>0, 'A'=>0, 'T'=>0, 'C'=>0, 'G'=>0);
foreach my $b(@base)
{
if($b =~ /[\.,]/)
{
$hash{'r'} ++;
next;
}
if($b=~ /[Aa]/)
{
$hash{'A'} ++;
next;
}
if($b =~ /[Tt]/)
{
$hash{'T'} ++;
next;
}
if($b =~ /[Cc]/)
{
$hash{'C'} ++;
next;
}
if($b =~ /[Gg]/)
{
$hash{'G'} ++;
next;
}
}
my ($m, $fb, $fbn, $sb, $sbn);
foreach my $i(sort {$hash{$b} <=> $hash{$a}} keys %hash)
{
$m ++;
if($m == 1 and $i eq "r")
{
$fb = $tmp[2];
$fbn = $hash{$i};
next;
}elsif($m == 1){
$fb = $i;
$fbn = $hash{$i};
next;
}
if($m == 2 and $i eq 'r')
{
$sb = $tmp[2];
$sbn = $hash{$i};
last;
}elsif($m == 2){
$sb = $i;
$sbn = $hash{$i};
last;
}
}
print join "\t", @tmp[0..2], $fb, $fbn, $sb, $sbn, "\n";
}
close IN;
sub showtime
{
my ($x) = @_;
my $t = localtime;
print STDERR "[$t] $x\n";
}
尤其是没有过滤质量值,所以得到的reads肯定是不准确的,另外一个过滤版本由于某些经验上的判定原因暂不发布。