Anopheles refactor part 3 - AnophelesGenomeFeaturesData #382
Conversation
|
|
||
| GCS_URL = "gs://vo_afun_release/" | ||
|
|
||
| GENOME_FASTA_PATH = ( | ||
| "reference/genome/idAnoFuneDA-416_04/idAnoFuneDA-416_04_1.curated_primary.fa" | ||
| ) | ||
| GENOME_FAI_PATH = ( | ||
| "reference/genome/idAnoFuneDA-416_04/idAnoFuneDA-416_04_1.curated_primary.fa.fai" | ||
| ) | ||
| GENOME_ZARR_PATH = ( | ||
| "reference/genome/idAnoFuneDA-416_04/idAnoFuneDA-416_04_1.curated_primary.zarr" | ||
| ) | ||
| SITE_ANNOTATIONS_ZARR_PATH = "reference/genome/idAnoFuneDA-416_04/Anopheles-funestus-DA-416_04_1_SEQANNOTATION.zarr" | ||
| GENOME_REF_ID = "idAnoFuneDA-416_04" | ||
| GENOME_REF_NAME = "Anopheles funestus" | ||
|
|
||
| CONTIGS = "2RL", "3RL", "X" |
There was a problem hiding this comment.
This should have been removed in an earlier PR; it is no longer needed because the contigs are now provided via the release config file.
Codecov ReportAttention: Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## master #382 +/- ##
==========================================
+ Coverage 92.02% 95.00% +2.97%
==========================================
Files 2 3 +1
Lines 188 380 +192
==========================================
+ Hits 173 361 +188
- Misses 15 19 +4 ☔ View full report in Codecov by Sentry. |
| def genome_features( | ||
| self, region=None, attributes=("ID", "Parent", "Note", "description") | ||
| ): | ||
| """Access genome feature annotations. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| region: str or list of str or Region or list of Region | ||
| Contig name (e.g., "2RL"), gene name (e.g., "LOC125767311"), genomic | ||
| region defined with coordinates (e.g., "2RL:44,989,425-44,998,059") or a | ||
| named tuple with genomic location `Region(contig, start, end)`. | ||
| Multiple values can be provided as a list, in which case data will | ||
| be concatenated, e.g., ["2RL", "3RL"]. | ||
| attributes : list of str, optional | ||
| Attribute keys to unpack into columns. Provide "*" to unpack all | ||
| attributes. | ||
|
|
||
| Returns | ||
| ------- | ||
| df : pandas.DataFrame | ||
| A dataframe of genome annotations, one row per feature. | ||
|
|
||
| """ | ||
|
|
||
| # Here we override the superclass implementation in order to provide a | ||
| # different default value for the `attributes` parameter, because the | ||
| # genome annotations don't include a "Name" attribute but do include a | ||
| # "Note" attribute which is probably useful to include instead. | ||
| # | ||
| # Also, we take the opportunity to customise the docstring to use | ||
| # examples specific to funestus. | ||
| # | ||
| # See also https://github.com/malariagen/malariagen-data-python/issues/306 | ||
|
|
||
| return super().genome_features(region=region, attributes=attributes) |
There was a problem hiding this comment.
Removed because this override is no longer needed; the default attributes are now handled differently.
| def _plot_genes_setup_data(self, *, region): | ||
| # Here we override the superclass implementation because the | ||
| # gene annotations don't include a "Name" attribute. | ||
| # | ||
| # Also, the type needed is "protein_coding_gene". | ||
|
|
||
| df_genome_features = self.genome_features( | ||
| region=region, attributes=["ID", "Parent", "description"] | ||
| ) | ||
| data = df_genome_features.query("type == 'protein_coding_gene'").copy() | ||
|
|
||
| tooltips = [ | ||
| ("ID", "@ID"), | ||
| ("Description", "@description"), | ||
| ("Location", "@contig:@start{,}-@end{,}"), | ||
| ] | ||
|
|
||
| return data, tooltips |
There was a problem hiding this comment.
This override is no longer needed; it is now handled in a general way in the parent class.
|
|
||
| GCS_URL = "gs://vo_agam_release/" | ||
|
|
||
| GENOME_FASTA_PATH = ( | ||
| "reference/genome/agamp4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa" | ||
| ) | ||
| GENOME_FAI_PATH = ( | ||
| "reference/genome/agamp4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa.fai" | ||
| ) | ||
| GENOME_ZARR_PATH = ( | ||
| "reference/genome/agamp4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.zarr" | ||
| ) | ||
| SITE_ANNOTATIONS_ZARR_PATH = ( | ||
| "reference/genome/agamp4/Anopheles-gambiae-PEST_SEQANNOTATION_AgamP4.12.zarr" | ||
| ) | ||
| GENOME_REF_ID = "AgamP4" | ||
| GENOME_REF_NAME = "Anopheles gambiae (PEST)" | ||
|
|
||
| CONTIGS = "2R", "2L", "3R", "3L", "X" |
There was a problem hiding this comment.
Should've been removed in an earlier PR.
There was a problem hiding this comment.
This is the new module with the AnophelesGenomeFeaturesData class.
| from .genome_sequence import AnophelesGenomeSequenceData | ||
|
|
||
|
|
||
| class gplt_params: |
There was a problem hiding this comment.
Moving these parameter definitions here, as this is the first time they're needed.
| return root | ||
|
|
||
|
|
||
| class Gff3Simulator: |
There was a problem hiding this comment.
This is a bit more complicated than I would've liked, but I tried to take the simplest route to generating some semi-realistic genome features data.
| major_version_number=_af1.MAJOR_VERSION_NUMBER, | ||
| major_version_path=_af1.MAJOR_VERSION_PATH, | ||
| pre=True, | ||
| pre=False, |
There was a problem hiding this comment.
Correcting this; I don't know how it got changed.
| def setup_subclass_cached(subclass, **kwargs): | ||
| if subclass == Ag3: | ||
| url = f"simplecache::{AG3_GCS_URL}" | ||
| elif subclass == Af1: | ||
| url = f"simplecache::{AF1_GCS_URL}" | ||
| else: | ||
| raise ValueError | ||
| return setup_subclass(subclass, url=url, **kwargs) |
There was a problem hiding this comment.
Had to rejig this a little because _gcs_url is no longer a class attribute.
| "subclass, sample_query, contig, analysis, sample_sets", | ||
| [ | ||
| (Ag3, "country == 'Ghana'", "3L", "gamb_colu", "3.0"), | ||
| (Af1, "country == 'Ghana'", "3RL", "funestus", "1.0"), | ||
| (Af1, "country == 'Ghana'", "X", "funestus", "1.0"), | ||
| ], | ||
| ) | ||
| @pytest.mark.parametrize( | ||
| "window_sizes", | ||
| [[100, 200, 500], [10000, 20000]], | ||
| ) | ||
| def test_h12_calibration( | ||
| subclass, sample_query, contig, analysis, sample_sets, window_sizes | ||
| ): | ||
| url = f"simplecache::{subclass._gcs_url}" | ||
| anoph = setup_subclass(subclass, url) | ||
| def test_h12_calibration(subclass, sample_query, contig, analysis, sample_sets): | ||
| anoph = setup_subclass_cached(subclass) | ||
|
|
||
| window_sizes = [10_000, 20_000] | ||
| calibration_runs = anoph.h12_calibration( |
There was a problem hiding this comment.
Took the opportunity to slim this test down, as it was running pretty slowly.
| (Ag3, "country == 'Ghana'", "3L", "gamb_colu", "3.0"), | ||
| (Af1, "country == 'Ghana'", "3RL", "funestus", "1.0"), | ||
| (Af1, "country == 'Ghana'", "X", "funestus", "1.0"), | ||
| ], | ||
| ) | ||
| @pytest.mark.parametrize( | ||
| "window_sizes", | ||
| [[100, 200, 500], [10000, 20000]], | ||
| ) | ||
| def test_g123_calibration( | ||
| subclass, sample_query, contig, site_mask, sample_sets, window_sizes | ||
| ): | ||
| url = f"simplecache::{subclass._gcs_url}" | ||
| anoph = setup_subclass(subclass, url) | ||
| def test_g123_calibration(subclass, sample_query, contig, site_mask, sample_sets): | ||
| anoph = setup_subclass_cached(subclass) | ||
|
|
||
| window_sizes = [10_000, 20_000] | ||
| calibration_runs = anoph.g123_calibration( |
There was a problem hiding this comment.
Took the opportunity to slim this test down, as it was running pretty slowly.
| with: | ||
| path: gcs_cache | ||
| key: gcs_cache_notebooks | ||
| key: gcs_cache_notebooks_20230405 |
There was a problem hiding this comment.
Clearing GCS cache because the release config files have changed.
|
Check out this pull request on ReviewNB. See visual diffs & provide feedback on Jupyter Notebooks. Powered by ReviewNB |
|
Thanks @sanjaynagi. I've pushed a couple more commits to deal with #334 where some GFFs have exon records with multiple parent transcripts. Also some general tidying up. Think I'm done here, planning to merge if CI is happy. |

Here we continue towards #366 by pulling out functions for accessing and plotting genome features into a new AnophelesGenomeFeaturesData class.
Also resolves #334 along the way.