1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
#![allow(clippy::identity_op)]
use std::fmt;
use crate::amino_acid::{translate, AminoAcid};
use crate::cds::CDS;
use crate::interval::Interval;
use crate::sequence_annotation::{SeqAnnotation, Strand};
use crate::MutationType;
/// Ensure that modulo also works correctly for negative numbers
fn mod3(i: isize) -> usize {
i.rem_euclid(3) as usize
}
fn complement(nuc: char) -> char {
match nuc {
'A' => 'T',
'C' => 'G',
'G' => 'C',
'T' => 'A',
_ => panic!("Unsupported nucleotide {}", nuc),
}
}
pub struct PointMutationClassifier<'a> {
gene_info: &'a SeqAnnotation,
middle_index: usize,
}
impl<'a> PointMutationClassifier<'a> {
pub fn new(gene_info: &'a SeqAnnotation, middle_index: usize) -> Self {
Self {
gene_info,
middle_index,
}
}
/// Classify the effect of a point mutation
///
/// Parameters:
/// `genomic_position`: The orginal genomic position on a chromosome where the mutation takes place
/// `seq`: The reference sequence around the mutation site, with the mutated locus in the middle
/// `other_nucleotide`: The nucleotide that will replace the old nucleotide
/// `cds_index_hint`: (optional) The internal array-index of the CDS that this mutation falls into. If you provide this, and it is correct, no search for a matching CDS needs to be done.
///
/// Returns: The corresponding `MutationType`
pub fn classify_by_position(
&self,
genomic_position: usize,
seq: &[char],
intron: &Option<Interval>,
) -> MutationType {
if intron.is_some() {
if self.disrupts_cannonical_splice_site(genomic_position, seq, &intron.expect("is some")) {
MutationType::SpliceSite
} else {
MutationType::Intronic
}
} else if self.disrupts_start_codon(genomic_position, seq) {
MutationType::StartCodon
} else {
MutationType::Unknown
}
}
/// Returns `true` if the point mutation disrupts a cannonical splice site.
/// A cannonical splice site is a left "GT" or a right "AG" at the ends of
/// an intron.
///
/// There are basically 4 locations of interest for this test:
///
/// 12 34
/// ...GT....AG...
/// ^ ^
/// |intron|
///
/// If any of these four bases are mutated, return True. Also take into
/// account the reverse complement:
///
/// |intron|
/// v v
/// ...GT....AG...
/// ...CA....TC...
/// 43 21 <- reverse complement tests case indexes
///
fn disrupts_cannonical_splice_site(
&self,
genomic_position: usize,
seq: &[char],
intron: &Interval,
) -> bool {
let middle_index = self.middle_index;
let start = intron.start;
let stop = intron.stop;
match self.gene_info.strand {
Strand::Plus => {
// 5' to 3' direction
if genomic_position == start {
// check for [G]T
seq[middle_index] == 'G' && seq[middle_index + 1] == 'T'
} else if genomic_position == start + 1 {
// check for G[T]
seq[middle_index - 1] == 'G' && seq[middle_index] == 'T'
} else if genomic_position == stop - 2 {
// check for [A]G
seq[middle_index] == 'A' && seq[middle_index + 1] == 'G'
} else if genomic_position == stop - 1 {
// check for A[G]
seq[middle_index - 1] == 'A' && seq[middle_index] == 'G'
} else {
false // not a canonical splice site
}
}
Strand::Minus => {
// Because the reference sequence is on the plus strand, we need to check for the reverse complement
if genomic_position == start {
// check for [C]T
seq[middle_index] == 'C' && seq[middle_index + 1] == 'T'
} else if genomic_position == start + 1 {
// check for C[T]
seq[middle_index - 1] == 'C' && seq[middle_index - 0] == 'T'
} else if genomic_position == stop - 2 {
// check for [A]C
seq[middle_index] == 'A' && seq[middle_index + 1] == 'C'
} else if genomic_position == stop - 1 {
// check for A[C]
seq[middle_index - 1] == 'A' && seq[middle_index - 0] == 'C'
} else {
false // not a canonical splice site
}
}
}
}
/// Check if a mutation at this position would disrupt the first codon
///
/// Parameters:
/// genomic_position: The position on the chromosome
fn disrupts_start_codon(&self, genomic_position: usize, _sequence: &[char]) -> bool {
let gene = self.gene_info;
match gene.strand {
Strand::Plus => {
if let Some(cds) = gene.coding_sequences.first() {
// the first CDS usually starts with the start codon
if let Some(distance_to_start_codon) =
genomic_position.checked_sub(cds.range.start)
{
/* //too much output for now
if distance_to_start_codon == 0 { // then we do a sanity check
let flanking: usize = _sequence.len() / 2;
if _sequence[ flanking .. flanking + 3 ] != [ 'A', 'T', 'G' ] {
eprintln!( "[WARNING] Lowest CDS on the + strand does not have a proper start codon. This can be correct for some genes" );
}
}
*/
return distance_to_start_codon < 3; // usize is always >= 0
}
}
}
Strand::Minus => {
if let Some(cds) = gene.coding_sequences.last() {
// the last CDS must end with a reverse start codon
if let Some(distance_to_start_codon) =
cds.range.stop.checked_sub(genomic_position)
{
/* //too much output for now
if distance_to_start_codon == 3 {
let flanking: usize = _sequence.len() / 2;
if _sequence[ flanking .. flanking + 3 ] != [ 'C', 'A', 'T' ] {
eprintln!( "[WARNING] Highest CDS on the - strand does not have a proper start codon. This can be correct for some genes" );
}
}
*/
return 0 < distance_to_start_codon && distance_to_start_codon < 4;
// keep in mind that the .stop is exclusive
}
}
}
}
false
}
pub fn classify_coding_mutation(
&self,
genomic_position: usize,
seq: &[char],
other_nucleotide: char,
cds: &CDS,
filter_plof: bool,
) -> MutationType {
let phase = cds.phase as usize;
let middle_index = self.middle_index;
// We ignore codons that are not contained in a single exon but are split
// between two.
// If we want to handle split codons better we should skip this match statement.
// Better solution would be to carry unused nucleotides from previous cds in cds struct
match self.gene_info.strand {
Strand::Plus => {
if genomic_position < cds.range.start + cds.phase as usize {
return MutationType::Unknown
} else if genomic_position >= cds.range.stop - ((cds.range.stop - (cds.range.start + cds.phase as usize))%3) {
return MutationType::Unknown
}
}
Strand::Minus => {
if genomic_position < cds.range.start + ((cds.range.stop - (cds.range.start + cds.phase as usize))%3) {
return MutationType::Unknown
} else if genomic_position >= cds.range.stop - cds.phase as usize {
return MutationType::Unknown
}
}
}
let (old_codon, relative_frame) = match self.gene_info.strand {
Strand::Plus => {
// easy mode
// The relative frame is the coding frame of the `genomic_position`
let relative_frame = mod3(
// this is ugly.
genomic_position as isize - cds.range.start as isize - phase as isize,
); // I need to establish a baseline for the frames. That baseline is (cds.range.start + phase). Then I need to calculate the distance from the baseline to the mutation
debug_assert!(relative_frame < 3, "relative_frame = {}", relative_frame);
let pos_of_codon_start = middle_index - relative_frame;
let nucs: String = seq[pos_of_codon_start..pos_of_codon_start + 3]
.iter()
.collect();
debug_assert_eq!(nucs.len(), 3);
(nucs, relative_frame)
}
Strand::Minus => {
// upside down
//I want to have the following properties for the frame:
//relative_frame=0 => first base of codon
//relative_frame=1 => second base of codon
//relative_frame=2 => third base of codon
//
//This means that the frames are as follows along the sequence:
//
// plus strand: 5' non-coding 3'
// minus strand: 3' 210 210 210 210 5'
//
// And we also process the CDS from the 5' to the 3' on the minus strand
//
// Example for a frame calculation:
//
// [210][210][2$X] reverse strand frames ($=CDS end position)
// 123 456 789 genomic positons
// ^
// |--- last inclusive position. So we have a phase of 1.
//
// Let's say we are at position 4. Then we need to know the
// distance to the end of the cds which is (8-1)-4=3. Then we
// have to take into account the phase which in this case is 1
// at $, so we have 3-1 = 2. And 2 mod 3 is 2.
//
// Another example:
//
// [210][21$]
// 123 456
// ^
// |--- last inclusive position. We have a phase of 2
//
// Let's say we are at position 1. Then the distance is
// (6-1)-1 = 4. We subtract the phase of 2 which gives us 2.
// 2 mod 3 is 2.
// Let's say we are at position 5. Then we calculate the distance
// (6-1)-5 = 0. We subtract the phase 2 so we are at -2.
// -2 mod 3 is 1.
let relative_frame = mod3(
cds.range.stop as isize
-1 // because the end position is exclusive
- phase as isize // move to a position that is at a codon start (on the reverse strand)
- genomic_position as isize,
); // get the distance and do mod3
debug_assert!(relative_frame < 3, "relative_frame = {}", relative_frame);
let mut nucs = String::with_capacity(3);
//let pos_of_codon_start = middle_index + relative_frame;
let codon_range = match relative_frame {
// v--- because end is eclusive
0 => (middle_index - 2..middle_index + 0 + 1),
1 => (middle_index - 1..middle_index + 1 + 1),
2 => (middle_index - 0..middle_index + 2 + 1),
_ => panic!("Modulo arithmethic is broken"),
};
for pos in codon_range.rev() {
//note the .rev() because we want to treat the final codon "normally"
nucs.push(complement(seq[pos]))
}
(nucs, relative_frame)
}
};
let new_codon = {
let mut codon = String::with_capacity(3);
for (i, c) in old_codon.chars().enumerate() {
codon.push(if i == relative_frame {
match self.gene_info.strand {
Strand::Plus => other_nucleotide,
Strand::Minus => complement(other_nucleotide),
}
} else {
c // same as old codon
})
}
codon
};
assert_eq!(old_codon.len(), 3);
assert_eq!(new_codon.len(), 3);
let old_aa = translate(&old_codon).unwrap();
let new_aa = translate(&new_codon).unwrap();
//let cds_end self.gene_info.coding_sequences.last().unwrap().range.stop
//println!("{:?} {:?}", self.gene_info.coding_sequences, self.gene_info.coding_sequences.last().unwrap().range.stop);
//println!("{:?} {:?} {:?} {:?}", old_codon, new_codon, old_aa, new_aa);
if old_aa == new_aa {
MutationType::Synonymous
} else if old_aa == AminoAcid::Stop {
MutationType::StopLoss
} else if new_aa == AminoAcid::Stop {
if filter_plof {
match self.gene_info.strand {
Strand::Plus => {
if genomic_position >= self.gene_info.get_end_minus50() {
MutationType::Unknown
} else {
MutationType::Nonsense
}
},
Strand::Minus => {
if genomic_position <= self.gene_info.get_end_minus50() {
MutationType::Unknown
} else {
MutationType::Nonsense
}
},
}
} else {
MutationType::Nonsense
}
} else {
MutationType::Missense
}
}
}
#[derive(Debug)]
pub struct SetupError {
filename: String,
message: String,
}
impl fmt::Display for SetupError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}: {}", self.filename, self.message)
}
}