-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbasic-summary-stats.pl
More file actions
executable file
·125 lines (106 loc) · 3.99 KB
/
basic-summary-stats.pl
File metadata and controls
executable file
·125 lines (106 loc) · 3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/perl
use strict;
use ProgramName;
use EssexParser;
use EssexICE;
$|=1;
my $name=ProgramName::get();
die "$name <in.essex>\n" unless @ARGV==1;
my ($infile)=@ARGV;
my (%tooManyErrors,%badAnnotation,%NMD,%prematureStop,%startCodonChange,
%splicingChanges,%frameshift,%annotationOK,%brokenDonor,%brokenAcceptor,
%proteinDiffers,%EJC,%newStart,%newStartNMD);
my $parser=new EssexParser($infile);
while(1) {
my $root=$parser->nextElem();
last unless $root;
my $ice=new EssexICE($root);
my $transcriptID=$ice->getTranscriptID();
my $geneID=$ice->getGeneID();
my $status=$root->findChild("status");
my $statusString=$ice->getStatusString();
if($statusString eq "mapped") { # mapped: includes too-many-vcf-errors
++$annotationOK{$geneID};
if($status->hasDescendentOrDatum("too-many-vcf-errors")) {
++$tooManyErrors{$geneID};
next }
if($status->hasDescendentOrDatum("new-upstream-start-codon")) {
++$newStart{$geneID};
my $newStart=$status->findDescendent("new-upstream-start-codon");
if($newStart->hasDescendentOrDatum("NMD")) { ++$newStartNMD{$geneID} }
}
if($status->hasDescendentOrDatum("NMD") &&
!$status->hasDescendentOrDatum("new-upstream-start-codon")) {
++$NMD{$geneID};
my $premature=$status->findChild("premature-stop");
#die unless $premature;
if(!$premature) { $status->print(\*STDERR); die }
my $dist=$premature->getAttribute("EJC-distance");
++$EJC{$geneID}->{$dist};
}
if($status->hasDescendentOrDatum("frameshift")) { ++$frameshift{$geneID} }
if($status->hasDescendentOrDatum("premature-stop"))
{ ++$prematureStop{$geneID} }
if($status->hasDescendentOrDatum("start-codon-change"))
{ ++$startCodonChange{$geneID} }
if($status->hasDescendentOrDatum("protein-differs"))
{ ++$proteinDiffers{$geneID} }
}
else { # splicing-changes/no-transcript/bad-annotation
if($statusString eq "bad-annotation") { ++$badAnnotation{$geneID} }
else { ++$annotationOK{$geneID} }
if($statusString eq "splicing-changes") {
++$splicingChanges{$geneID};
if($status->hasDescendentOrDatum("broken-donor"))
{ ++$brokenDonor{$geneID} }
if($status->hasDescendentOrDatum("broken-acceptor"))
{ ++$brokenAcceptor{$geneID} }
}
}
}
$parser->close();
# Too many VCF errors
my $errors=keys %tooManyErrors;
print "$errors genes had too many VCF errors\n";
# Bad annotation
my $badAnnos=keys %badAnnotation;
print "$badAnnos genes had bad annotations\n";
# Genes with at least one transcript suffering NMD
my $nmdGenes=keys %NMD;
print "$nmdGenes mapped genes had NMD\n";
# Premature stop
my $numPrematureStop=keys %prematureStop;
print "$numPrematureStop mapped genes had a premature stop\n";
# Start codon change
my $numStartChange=keys %startCodonChange;
print "$numStartChange mapped genes had a change to the start codon\n";
# Splicing changes
my $numSplicingChanges=keys %splicingChanges;
print "$numSplicingChanges genes had splicing changes\n";
# Frameshift
my $numFrameshift=keys %frameshift;
print "$numFrameshift mapped genes had a frameshift indel\n";
# Annotation is OK
my $annoOK=keys %annotationOK;
print "$annoOK genes had a valid annotation\n";
# Broken donor site
my $brokenDonor=keys %brokenDonor;
print "$brokenDonor genes had at broken donor site\n";
# Broken acceptor site
my $brokenAcceptor=keys %brokenAcceptor;
print "$brokenAcceptor genes had a broken acceptor site\n";
# Protein differs
my $proteinDiffers=keys %proteinDiffers;
print "$proteinDiffers genes had a mapped transcript whose protein changed\n";
# New upstream start codon
my $newStart=keys %newStart;
print "$newStart genes had a new upstream start codon\n";
my $newStartNMD=keys %newStartNMD;
print "$newStartNMD genes had a new upstream start codon predicted to cause NMD\n";
# Distance of stop codon to EJC when there's NMD
my @genes=keys %EJC;
foreach my $gene (@genes) {
my @distances=keys %{$EJC{$gene}};
foreach my $dist (@distances) { print "EJC_DISTANCE\t$dist\n" }
}
print STDERR "[done]\n";