Annotating alignments and sequences
Section author: Peter Maxwell, Gavin Huttley
A Sequence with a couple of exons on it.
from cogent3 import DNA
from cogent3.core.annotation import Feature
s = DNA.make_seq("AAGAAGAAGACCCCCAAAAAAAAAATTTTTTTTTTAAAAAAAAAAAAA", name="Orig")
exon1 = s.add_annotation(Feature, "exon", "fred", [(10, 15)])
exon2 = s.add_annotation(Feature, "exon", "trev", [(30, 40)])
The corresponding sequence can be extracted either with slice notation or by asking the feature to do it, since the feature knows what sequence it belongs to.
s[exon1]
exon1.get_slice()
0 | |
Orig | CCCCC |
5 DnaSequence
Usually the only way to get a `Feature` object like `exon1` is to ask the sequence for it. There is one method for querying annotations by type and optionally by name:
exons = s.get_annotations_matching("exon")
print(exons)
[exon "fred" at [10:15]/48, exon "trev" at [30:40]/48]
If the sequence does not have a matching feature you get back an empty list, and slicing the sequence with that returns a sequence of length 0.
dont_exist = s.get_annotations_matching("dont_exist")
dont_exist
s[dont_exist]
0 DnaSequence
To construct a pseudo-feature covering (or excluding) multiple features, use `get_region_covering_all`:
print(s.get_region_covering_all(exons))
print(s.get_region_covering_all(exons).get_shadow())
region "exon" at [10:15, 30:40]/48
region "not exon" at [0:10, 15:30, 40:48]/48
eg: all the exon sequence:
s.get_region_covering_all(exons).get_slice()
0 | |
Orig | CCCCCTTTTTAAAAA |
15 DnaSequence
or with slice notation:
s[exon1, exon2]
0 | |
Orig | CCCCCTTTTTAAAAA |
15 DnaSequence
`.get_region_covering_all` also guarantees no overlaps within the result, whereas slicing does not:
print(s.get_region_covering_all(exons + exons))
s[exon1, exon1, exon1, exon1, exon1]
region "exon" at [10:15, 30:40]/48
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [8], in <module>
1 print(s.get_region_covering_all(exons + exons))
----> 2 s[exon1, exon1, exon1, exon1, exon1]
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/annotation.py:90, in _Annotatable.__getitem__(self, index)
88 map = self._as_map(index)
89 new = self._mapped(map)
---> 90 sliced_annots = self._sliced_annotations(new, map)
91 new.attach_annotations(sliced_annots)
92 if hasattr(self, "_repr_policy"):
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/annotation.py:36, in _Annotatable._sliced_annotations(self, new, slice)
34 slicemap = self._as_map(slice)
35 # try:
---> 36 newmap = slicemap.inverse()
37 # except ValueError, detail:
38 # print "Annotations dropped because %s" % detail
39 # return []
40 if slicemap.useful:
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/location.py:754, in Map.inverse(self)
752 def inverse(self):
753 if self.__inverse is None:
--> 754 self.__inverse = self._inverse()
755 return self.__inverse
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/location.py:779, in Map._inverse(self)
777 new_spans.append(LostSpan(lo - last_hi))
778 elif lo < last_hi:
--> 779 raise ValueError(f"Uninvertable. Overlap: {lo} < {last_hi}")
780 new_spans.append(Span(start, end, reverse=start > end))
781 last_hi = hi
ValueError: Uninvertable. Overlap: 10 < 15
You can use features, maps, slices or integers, but non-monotonic slices are not allowed:
s[15:20, 5:16]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [9], in <module>
----> 1 s[15:20, 5:16]
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/annotation.py:90, in _Annotatable.__getitem__(self, index)
88 map = self._as_map(index)
89 new = self._mapped(map)
---> 90 sliced_annots = self._sliced_annotations(new, map)
91 new.attach_annotations(sliced_annots)
92 if hasattr(self, "_repr_policy"):
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/annotation.py:36, in _Annotatable._sliced_annotations(self, new, slice)
34 slicemap = self._as_map(slice)
35 # try:
---> 36 newmap = slicemap.inverse()
37 # except ValueError, detail:
38 # print "Annotations dropped because %s" % detail
39 # return []
40 if slicemap.useful:
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/location.py:754, in Map.inverse(self)
752 def inverse(self):
753 if self.__inverse is None:
--> 754 self.__inverse = self._inverse()
755 return self.__inverse
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/location.py:779, in Map._inverse(self)
777 new_spans.append(LostSpan(lo - last_hi))
778 elif lo < last_hi:
--> 779 raise ValueError(f"Uninvertable. Overlap: {lo} < {last_hi}")
780 new_spans.append(Span(start, end, reverse=start > end))
781 last_hi = hi
ValueError: Uninvertable. Overlap: 15 < 16
Features are themselves sliceable:
exon1[0:3].get_slice()
0 | |
Orig | CCC |
3 DnaSequence
When sequences are concatenated they keep their (non-overlapping) annotations:
c = s[exon1[4:]] + s
print(len(c))
for feat in c.annotations:
print(feat)
49
exon "fred" at [-4-, 0:1]/49
exon "fred" at [11:16]/49
exon "trev" at [31:41]/49
Since features know their parents you can’t use a feature from one sequence to slice another:
print(c[exon1])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [12], in <module>
----> 1 print(c[exon1])
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/annotation.py:88, in _Annotatable.__getitem__(self, index)
87 def __getitem__(self, index):
---> 88 map = self._as_map(index)
89 new = self._mapped(map)
90 sliced_annots = self._sliced_annotations(new, map)
File ~/opt/miniconda3/envs/c310dev/lib/python3.10/site-packages/cogent3/core/annotation.py:78, in _Annotatable._as_map(self, index)
76 base = base.parent
77 if base is not self:
---> 78 raise ValueError(
79 f"Can't map {index} onto {repr(self)} via {containers}"
80 )
81 for base in containers:
82 feature = feature.remapped_to(base, base.map)
ValueError: Can't map exon "fred" at [10:15]/48 onto DnaSequence(CAAGAAG... 49) via []
Features are generally attached to the thing they annotate, but in those cases where a free-floating feature is created it can later be attached:
len(s.annotations)
region = s.get_region_covering_all(exons)
len(s.annotations)
region.attach()
len(s.annotations)
region.detach()
len(s.annotations)
2
When dealing with sequences that can be reverse complemented (e.g. `DnaSequence`), features are not reversed. Features are considered to have strand-specific meaning (e.g. CDS, exons) and so stay on their original strands. We create a sequence with a CDS that spans multiple exons, and show that after getting the reverse complement we have exactly the same result from getting the CDS annotation.
plus = DNA.make_seq("AAGGGGAAAACCCCCAAAAAAAAAATTTTTTTTTTAAA", name="plus")
plus_cds = plus.add_annotation(Feature, "CDS", "gene", [(2, 6), (10, 15), (25, 35)])
print(plus_cds.get_slice())
minus = plus.rc()
minus_cds = minus.get_annotations_matching("CDS")[0]
print(minus_cds.get_slice())
GGGGCCCCCTTTTTTTTTT
GGGGCCCCCTTTTTTTTTT
Sequence features can be accessed via a containing `Alignment`:
from cogent3 import make_aligned_seqs
aln = make_aligned_seqs(
[["x", "-AAAAAAAAA"], ["y", "TTTT--TTTT"]], array_align=False
)
print(aln)
exon = aln.get_seq("x").add_annotation(Feature, "exon", "fred", [(3, 8)])
aln_exons = aln.get_annotations_from_seq("x", "exon")
aln_exons = aln.get_annotations_from_any_seq("exon")
>x
-AAAAAAAAA
>y
TTTT--TTTT
But these will be returned as alignment features with locations in alignment coordinates.
print(exon)
print(aln_exons[0])
print(aln_exons[0].get_slice())
aln_exons[0].attach()
len(aln.annotations)
exon "fred" at [3:8]/9
exon "fred" at [4:9]/10
>x
AAAAA
>y
--TTT
1
Similarly alignment features can be projected onto the aligned sequences, where they may end up falling across gaps:
exons = aln.get_projected_annotations("y", "exon")
print(exons)
print(aln.get_seq("y")[exons[0].map.without_gaps()])
[exon "fred" at [-2-, 4:7]/8]
TTT
We copy the annotations from another sequence,
aln = make_aligned_seqs(
[["x", "-AAAAAAAAA"], ["y", "TTTT--CCCC"]], array_align=False
)
s = DNA.make_seq("AAAAAAAAA", name="x")
exon = s.add_annotation(Feature, "exon", "fred", [(3, 8)])
exon = aln.get_seq("x").copy_annotations(s)
aln_exons = list(aln.get_annotations_from_seq("x", "exon"))
print(aln_exons)
[exon "fred" at [4:9]/10]
even if the name is different.
exon = aln.get_seq("y").copy_annotations(s)
aln_exons = list(aln.get_annotations_from_seq("y", "exon"))
print(aln_exons)
print(aln[aln_exons])
[exon "fred" at [3:4, 6:10]/10]
>x
AAAAA
>y
TCCCC
If the feature lies outside the sequence being copied to, you get a lost span.
aln = make_aligned_seqs([["x", "-AAAA"], ["y", "TTTTT"]], array_align=False)
seq = DNA.make_seq("CCCCCCCCCCCCCCCCCCCC", "x")
exon = seq.add_feature("exon", "A", [(5, 8)])
aln.get_seq("x").copy_annotations(seq)
copied = list(aln.get_annotations_from_seq("x", "exon"))
copied
copied[0].get_slice()
0 | |
x | ---- |
y | .... |
2 x 4 text alignment
You can copy to a sequence with a different name, in a different alignment, if the feature lies within the length.
aln = make_aligned_seqs(
[["x", "-AAAAAAAAA"], ["y", "TTTT--TTTT"]], array_align=False
)
seq = DNA.make_seq("CCCCCCCCCCCCCCCCCCCC", "x")
match_exon = seq.add_feature("exon", "A", [(5, 8)])
aln.get_seq("y").copy_annotations(seq)
copied = list(aln.get_annotations_from_seq("y", "exon"))
copied
[exon "A" at [7:10]/10]
If the sequence being copied to is shorter than the feature's coordinates, again you get a lost span.
aln = make_aligned_seqs(
[["x", "-AAAAAAAAA"], ["y", "TTTT--TTTT"]], array_align=False
)
diff_len_seq = DNA.make_seq("CCCCCCCCCCCCCCCCCCCCCCCCCCCC", "x")
nonmatch = diff_len_seq.add_feature("repeat", "A", [(12, 14)])
aln.get_seq("y").copy_annotations(diff_len_seq)
copied = list(aln.get_annotations_from_seq("y", "repeat"))
copied
[repeat "A" at [10:10, -6-]/10]
We consider cases where there are terminal gaps.
aln = make_aligned_seqs(
[["x", "-AAAAAAAAA"], ["y", "------TTTT"]], array_align=False
)
exon = aln.get_seq("x").add_feature("exon", "fred", [(3, 8)])
aln_exons = list(aln.get_annotations_from_seq("x", "exon"))
print(aln_exons)
print(aln_exons[0].get_slice())
aln = make_aligned_seqs(
[["x", "-AAAAAAAAA"], ["y", "TTTT--T---"]], array_align=False
)
exon = aln.get_seq("x").add_feature("exon", "fred", [(3, 8)])
aln_exons = list(aln.get_annotations_from_seq("x", "exon"))
print(aln_exons[0].get_slice())
[exon "fred" at [4:9]/10]
>x
AAAAA
>y
--TTT
>x
AAAAA
>y
--T--
In this case, only those residues included within the feature are covered - note the omission of the T in `y` opposite the gap in `x`.
aln = make_aligned_seqs(
[["x", "C-CCCAAAAA"], ["y", "-T----TTTT"]], moltype="dna", array_align=False
)
print(aln)
exon = aln.get_seq("x").add_feature("exon", "ex1", [(0, 4)])
print(exon)
print(exon.get_slice())
aln_exons = list(aln.get_annotations_from_seq("x", "exon"))
print(aln_exons)
print(aln_exons[0].get_slice())
>x
C-CCCAAAAA
>y
-T----TTTT
exon "ex1" at [0:4]/9
CCCC
[exon "ex1" at [0:1, 2:5]/10]
>x
CCCC
>y
----
When `Feature.as_one_span()` is applied to the exon that straddles the gap in `x`, the result is that we preserve that feature, gap included.
print(aln_exons[0].as_one_span().get_slice())
>x
C-CCC
>y
-T---
These properties also are consistently replicated with reverse complemented sequences.
aln_rc = aln.rc()
rc_exons = list(aln_rc.get_annotations_from_any_seq("exon"))
print(aln_rc[rc_exons]) # not using as_one_span, so gap removed from x
print(aln_rc[rc_exons[0].as_one_span()])
>x
CCCC
>y
----
>x
C-CCC
>y
-T---
Features can provide their coordinates, useful for custom analyses.
all_exons = aln.get_region_covering_all(aln_exons)
coords = all_exons.get_coordinates()
assert coords == [(0, 1), (2, 5)]
Annotated regions can be masked (observed sequence characters replaced by another), either through the sequence on which they reside or by projection from the alignment. Note that `mask_char` must be a valid character for the sequence `MolType`. Either the features (multiple can be named), or their shadow, can be masked.
We create an alignment with a sequence that has two different annotation types.
aln = make_aligned_seqs(
[["x", "C-CCCAAAAAGGGAA"], ["y", "-T----TTTTG-GTT"]], array_align=False
)
print(aln)
exon = aln.get_seq("x").add_feature("exon", "norwegian", [(0, 4)])
print(exon.get_slice())
repeat = aln.get_seq("x").add_feature("repeat", "blue", [(9, 12)])
print(repeat.get_slice())
repeat = aln.get_seq("y").add_feature("repeat", "frog", [(5, 7)])
print(repeat.get_slice())
>x
C-CCCAAAAAGGGAA
>y
-T----TTTTG-GTT
CCCC
GGG
GG
Each sequence should correctly mask either the single feature, its shadow, or the multiple features, or their shadow.
print(aln.get_seq("x").with_masked_annotations("exon", mask_char="?"))
print(aln.get_seq("x").with_masked_annotations("exon", mask_char="?", shadow=True))
print(aln.get_seq("x").with_masked_annotations(["exon", "repeat"], mask_char="?"))
print(
aln.get_seq("x").with_masked_annotations(
["exon", "repeat"], mask_char="?", shadow=True
)
)
print(aln.get_seq("y").with_masked_annotations("exon", mask_char="?"))
print(aln.get_seq("y").with_masked_annotations("repeat", mask_char="?"))
print(
aln.get_seq("y").with_masked_annotations("repeat", mask_char="?", shadow=True)
)
????AAAAAGGGAA
CCCC??????????
????AAAAA???AA
CCCC?????GGG??
TTTTTGGTT
TTTTT??TT
?????GG??
The same methods can be applied to annotated Alignments.
print(aln.with_masked_annotations("exon", mask_char="?"))
print(aln.with_masked_annotations("exon", mask_char="?", shadow=True))
print(aln.with_masked_annotations("repeat", mask_char="?"))
print(aln.with_masked_annotations("repeat", mask_char="?", shadow=True))
print(aln.with_masked_annotations(["repeat", "exon"], mask_char="?"))
print(aln.with_masked_annotations(["repeat", "exon"], shadow=True))
>x
?-???AAAAAGGGAA
>y
-T----TTTTG-GTT
>x
C-CCC??????????
>y
-?----?????-???
>x
C-CCCAAAAA???AA
>y
-T----TTTT?-?TT
>x
?-????????GGG??
>y
-?----????G-G??
>x
?-???AAAAA???AA
>y
-T----TTTT?-?TT
>x
C-CCC?????GGG??
>y
-?----????G-G??
It shouldn’t matter whether annotated coordinates are entered separately, or as a series.
data = [["human", "CGAAACGTTT"], ["mouse", "CTAAACGTCG"]]
as_series = make_aligned_seqs(data, array_align=False)
as_items = make_aligned_seqs(data, array_align=False)
We add annotations to the sequences as a series.
as_series.get_seq("human").add_feature("cpgsite", "cpg", [(0, 2), (5, 7)])
as_series.get_seq("mouse").add_feature("cpgsite", "cpg", [(5, 7), (8, 10)])
cpgsite "cpg" at [5:7, 8:10]/10
We add the annotations to the sequences one segment at a time.
as_items.get_seq("human").add_feature("cpgsite", "cpg", [(0, 2)])
as_items.get_seq("human").add_feature("cpgsite", "cpg", [(5, 7)])
as_items.get_seq("mouse").add_feature("cpgsite", "cpg", [(5, 7)])
as_items.get_seq("mouse").add_feature("cpgsite", "cpg", [(8, 10)])
cpgsite "cpg" at [8:10]/10
These different constructions should generate the same output.
serial = as_series.with_masked_annotations(["cpgsite"])
print(serial)
itemwise = as_items.with_masked_annotations(["cpgsite"])
print(itemwise)
>human
??AAA??TTT
>mouse
CTAAA??T??
>human
??AAA??TTT
>mouse
CTAAA??T??
Annotations should be correctly masked, whether the sequence has been reverse complemented or not. We use the plus/minus strand CDS containing sequences created above.
print(plus.with_masked_annotations("CDS"))
print(minus.with_masked_annotations("CDS"))
AA????AAAA?????AAAAAAAAAA??????????AAA
TTT??????????TTTTTTTTTT?????TTTT????TT