-
Notifications
You must be signed in to change notification settings - Fork 14
batch saving genomes #212
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
batch saving genomes #212
Changes from 69 commits
6208a9a
a2fabf4
330d6c2
27158ec
04aa203
8d672d1
d1dd768
e79c297
35030a0
4b1b88b
b56d77f
788c7ee
65467ea
66af0b7
fcbb510
32f96a0
41eed0b
71670bc
0cbe155
52e280d
f018f0e
54fedf0
861a50b
01a1dc8
654a5e9
bda89a0
6bb1009
2f546ac
0d192f7
a8bced6
b690b09
c1018df
6f5de92
bd8ea82
787c7b9
828edb6
595c78b
8992544
306ae55
cf72c07
f2b8f99
cc5cee1
d5ae8a3
7022f35
be96213
f624b11
7fb8355
79b44fd
6f64bbd
a3eaec2
f22ca6a
19ccfb5
c306fc6
948e3b1
b626756
5378631
ef95de7
fafc886
e7d0e04
0ae6e9d
6f0f0f7
6abafd6
ab8b829
f5f6aee
d360fda
e30e0e1
8dc7b37
1571b43
ed57557
2eb63c1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -69,7 +69,7 @@ class GenomeFileUtil: | |
######################################### noqa | ||
VERSION = "0.11.7" | ||
GIT_URL = "git@github.com:kbaseapps/GenomeFileUtil.git" | ||
GIT_COMMIT_HASH = "591a19ccf4d1b42f01cc06486654b6d3a8ea08e4" | ||
GIT_COMMIT_HASH = "861a50b5ce2d0b8b481fe514b41a6d6b89ee0c96" | ||
|
||
#BEGIN_CLASS_HEADER | ||
#END_CLASS_HEADER | ||
|
@@ -831,27 +831,27 @@ def fasta_gff_to_metagenome(self, ctx, params): | |
def save_one_genome(self, ctx, params): | ||
""" | ||
:param params: instance of type "SaveOneGenomeParams" -> structure: | ||
parameter "workspace" of String, parameter "name" of String, | ||
parameter "data" of type "Genome" (Genome type -- annotated and | ||
assembled genome data. Field descriptions: id - string - KBase | ||
legacy data ID scientific_name - string - human readable species | ||
name domain - string - human readable phylogenetic domain name | ||
(eg. "Bacteria") warnings - list of string - genome-level warnings | ||
generated in the annotation process genome_tiers - list of string | ||
- controlled vocabulary (based on app input and checked by | ||
GenomeFileUtil) A list of labels describing the data source for | ||
this genome. Allowed values - Representative, Reference, | ||
ExternalDB, User Tier assignments based on genome source: * All | ||
phytozome - Representative and ExternalDB * Phytozome flagship | ||
genomes - Reference, Representative and ExternalDB * Ensembl - | ||
Representative and ExternalDB * RefSeq Reference - Reference, | ||
Representative and ExternalDB * RefSeq Representative - | ||
Representative and ExternalDB * RefSeq Latest or All Assemblies | ||
folder - ExternalDB * User Data - User tagged feature_counts - map | ||
of string to integer - total counts of each type of feature keys | ||
are a controlled vocabulary of - "CDS", "gene", "misc_feature", | ||
"misc_recomb", "mobile_element", "ncRNA" - 72, | ||
"non_coding_features", "non_coding_genes", | ||
parameter "workspace_id" of Long, parameter "workspace" of String, | ||
parameter "name" of String, parameter "data" of type "Genome" | ||
(Genome type -- annotated and assembled genome data. Field | ||
descriptions: id - string - KBase legacy data ID scientific_name - | ||
string - human readable species name domain - string - human | ||
readable phylogenetic domain name (eg. "Bacteria") warnings - list | ||
of string - genome-level warnings generated in the annotation | ||
process genome_tiers - list of string - controlled vocabulary | ||
(based on app input and checked by GenomeFileUtil) A list of | ||
labels describing the data source for this genome. Allowed values | ||
- Representative, Reference, ExternalDB, User Tier assignments | ||
based on genome source: * All phytozome - Representative and | ||
ExternalDB * Phytozome flagship genomes - Reference, | ||
Representative and ExternalDB * Ensembl - Representative and | ||
ExternalDB * RefSeq Reference - Reference, Representative and | ||
ExternalDB * RefSeq Representative - Representative and ExternalDB | ||
* RefSeq Latest or All Assemblies folder - ExternalDB * User Data | ||
- User tagged feature_counts - map of string to integer - total | ||
counts of each type of feature keys are a controlled vocabulary of | ||
- "CDS", "gene", "misc_feature", "misc_recomb", "mobile_element", | ||
"ncRNA" - 72, "non_coding_features", "non_coding_genes", | ||
"protein_encoding_gene", "rRNA", "rep_origin", "repeat_region", | ||
"tRNA" genetic_code - int - An NCBI-assigned taxonomic category | ||
for the organism See here - | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,8 @@ | |
from installed_clients.WorkspaceClient import Workspace | ||
from GenomeFileUtil.core.GenomeUtils import ( | ||
is_parent, propagate_cds_props_to_gene, warnings, parse_inferences, | ||
load_ontology_mappings, set_taxon_data, set_default_taxon_data | ||
load_ontology_mappings, set_taxon_data, set_default_taxon_data, | ||
set_up_single_params, validate_mass_params | ||
) | ||
|
||
MAX_MISC_FEATURE_SIZE = 10000 | ||
|
@@ -113,52 +114,16 @@ def __init__(self, config): | |
|
||
def import_genbank(self, params): | ||
print('validating parameters') | ||
mass_params = self._set_up_single_params(params) | ||
mass_params = set_up_single_params( | ||
params, _WSNAME, self._validate_params, self.dfu.ws_name_to_id | ||
) | ||
return self._import_genbank_mass(mass_params)[0] | ||
|
||
def import_genbank_mass(self, params): | ||
print('validating parameters') | ||
self._validate_mass_params(params) | ||
validate_mass_params(params, self._validate_params) | ||
return self._import_genbank_mass(params) | ||
|
||
def _set_up_single_params(self, params): | ||
# avoid side effects and keep variables in params unmodfied | ||
inputs = dict(params) | ||
self._validate_params(inputs) | ||
ws_id = self._get_int(inputs.pop(_WSID, None), _WSID) | ||
ws_name = inputs.pop(_WSNAME, None) | ||
if (bool(ws_id) == bool(ws_name)): # xnor | ||
raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WSNAME}' parameter must be provided") | ||
if not ws_id: | ||
print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting " | ||
+ "a workspace ID over a mutable workspace name that may cause race conditions") | ||
ws_id = self.dfu.ws_name_to_id(ws_name) | ||
mass_params = {_WSID: ws_id, _INPUTS: [inputs]} | ||
return mass_params | ||
|
||
def _validate_mass_params(self, params): | ||
ws_id = self._get_int(params.get(_WSID), _WSID) | ||
if not ws_id: | ||
raise ValueError(f"{_WSID} is required") | ||
inputs = params.get(_INPUTS) | ||
if not inputs or type(inputs) is not list: | ||
raise ValueError(f"{_INPUTS} field is required and must be a non-empty list") | ||
for i, inp in enumerate(inputs, start=1): | ||
if type(inp) is not dict: | ||
raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required") | ||
try: | ||
self._validate_params(inp) | ||
except Exception as e: | ||
raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e | ||
|
||
def _get_int(self, putative_int, name, minimum=1): | ||
if putative_int is not None: | ||
if type(putative_int) is not int: | ||
raise ValueError(f"{name} must be an integer, got: {putative_int}") | ||
if putative_int < minimum: | ||
raise ValueError(f"{name} must be an integer >= {minimum}") | ||
return putative_int | ||
|
||
def _import_genbank_mass(self, params): | ||
|
||
workspace_id = params[_WSID] | ||
|
@@ -197,6 +162,12 @@ def _import_genbank_mass(self, params): | |
# parse genbank file | ||
self._parse_genbank(genome_obj) | ||
|
||
# check features | ||
self.gi.check_dna_sequence_in_features(genome_obj.genome_data) | ||
|
||
# validate genome | ||
genome_obj.genome_data['warnings'] = self.gi.validate_genome(genome_obj.genome_data) | ||
|
||
Comment on lines
+165
to
+170
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are there tests for G2G that exercise these code paths? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, we have genbank_upload_full_test.py. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And there are tests that cause errors to be thrown from the check / validate methods? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
https://github.com/kbaseapps/GenomeFileUtil/blob/master/test/utility/genome_size_tests.py There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hold on, I'm confused - these two checks are being added to the import mass function, but genbank_upload_full_test isn't changing. How can it test that these functions are called correctly from the mass function? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're talking about test_full_sequence, test_partial_sequence, and test_no_sequence_kept? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, those three There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I put them in files and diff them there are differences. I don't know anything about this file other than what the git history says about it unfortunately There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. refactored genome_size_test.py script and added mass save tests for the check_dna_sequence_in_features function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The changes you made to the tests look good and are valuable in and of themselves, but reading through the tests I'm now not sure they actually exercise the check_dna_sequence_in_features function in a meaningful way. 
The only way that function does anything detectable is if there is at least one feature in the genome without dna sequence, in which case it will add dna sequences by calling a service to retrieve them. It's not clear to me whether the test genome file has any features that are missing dna sequences. If there aren't any then the 3 size tests will pass even if the line in the mass function that calls check_dna_sequence_in_features is deleted.
MrCreosote marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# gather all objects | ||
genome_objs.append(genome_obj) | ||
|
||
|
@@ -209,7 +180,6 @@ def _import_genbank_mass(self, params): | |
for genome_obj in genome_objs: | ||
shutil.rmtree(genome_obj.input_directory) | ||
|
||
# TODO make an internal mass function save_genomes | ||
results = self._save_genomes(workspace_id, genome_objs) | ||
|
||
# return the result | ||
|
@@ -227,17 +197,18 @@ def _import_genbank_mass(self, params): | |
return details | ||
|
||
def _save_genomes(self, workspace_id, genome_objs): | ||
results = [ | ||
self.gi.save_one_genome( | ||
{ | ||
'workspace': workspace_id, | ||
'name': genome_obj.genome_name, | ||
'data': genome_obj.genome_data, | ||
"meta": genome_obj.genome_meta, | ||
} | ||
) for genome_obj in genome_objs | ||
] | ||
|
||
results = self.gi.save_genome_mass( | ||
MrCreosote marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
"workspace_id": workspace_id, | ||
"inputs": [ | ||
{ | ||
"name": genome_obj.genome_name, | ||
"data": genome_obj.genome_data, | ||
"meta": genome_obj.genome_meta, | ||
} for genome_obj in genome_objs | ||
], | ||
} | ||
) | ||
return results | ||
|
||
def _validate_params(self, params): | ||
|
Uh oh!
There was an error while loading. Please reload this page.