diff --git a/gffutils/create.py b/gffutils/create.py index e137c1a..09c71dc 100644 --- a/gffutils/create.py +++ b/gffutils/create.py @@ -675,8 +675,6 @@ def _populate_from_lines(self, lines): def _update_relations(self): logger.info("Updating relations") c = self.conn.cursor() - c2 = self.conn.cursor() - c3 = self.conn.cursor() # TODO: pre-compute indexes? # c.execute('CREATE INDEX ids ON features (id)') @@ -684,55 +682,36 @@ def _update_relations(self): # c.execute('CREATE INDEX childindex ON relations (child)') # self.conn.commit() - if isinstance(self._keep_tempfiles, str): - suffix = self._keep_tempfiles - else: - suffix = ".gffutils" - tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name - with open(tmp, "w") as fout: - - # Here we look for "grandchildren" -- for each ID, get the child - # (parenthetical subquery below); then for each of those get *its* - # child (main query below). - # - # Results are written to temp file so that we don't read and write at - # the same time, which would slow things down considerably. - - c.execute("SELECT id FROM features") - for parent in c: - c2.execute( - """ - SELECT child FROM relations WHERE parent IN - (SELECT child FROM relations WHERE parent = ?) - """, - tuple(parent), + c.execute( + """ + WITH RECURSIVE generation AS ( + SELECT parent, + child, + level + FROM relations + WHERE level = 1 + + UNION ALL + + SELECT g.parent, + child.child, + g.level+1 AS level + FROM relations child + JOIN generation g + ON g.child = child.parent ) - for grandchild in c2: - fout.write("\t".join((parent[0], grandchild[0])) + "\n") - - def relations_generator(): - with open(fout.name) as fin: - for line in fin: - parent, child = line.strip().split("\t") - yield dict(parent=parent, child=child, level=2) - - c.executemany( - """ - INSERT OR IGNORE INTO relations VALUES - (:parent, :child, :level) - """, - relations_generator(), - ) - + INSERT OR IGNORE INTO relations + SELECT * FROM generation + WHERE level > 1; + """ + ) + # TODO: Index creation. Which ones affect performance? c.execute("DROP INDEX IF EXISTS binindex") c.execute("CREATE INDEX binindex ON features (bin)") self.conn.commit() - if not self._keep_tempfiles: - os.unlink(fout.name) - class _GTFDBCreator(_DBCreator): def __init__(self, *args, **kwargs): diff --git a/gffutils/test/test_1.py b/gffutils/test/test_1.py index 2b88cc0..9d58e351 100644 --- a/gffutils/test/test_1.py +++ b/gffutils/test/test_1.py @@ -961,15 +961,6 @@ def test_tempfiles(): assert len(filelist) == 1, filelist assert filelist[0].endswith(".gffutils") - # ...and another one for gff. This time, make sure the suffix - db = gffutils.create_db( - gffutils.example_filename("FBgn0031208.gff"), ":memory:", _keep_tempfiles=True - ) - filelist = os.listdir(tempdir) - assert len(filelist) == 2, filelist - for i in filelist: - assert i.endswith(".gffutils") - # OK, now delete what we have so far... clean_tempdir() diff --git a/gffutils/test/test_issues.py b/gffutils/test/test_issues.py index 79996ba..276c14f 100644 --- a/gffutils/test/test_issues.py +++ b/gffutils/test/test_issues.py @@ -473,6 +473,45 @@ def test_issue_198(): assert f.attributes["Parent"] == ["XM_001475631.1", ""] +def test_issue_204(): + txt = dedent("""\ + chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903; + chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903; + chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1; + chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1; + """) + + db = gffutils.create_db(txt.replace(" ", "\t"), ":memory:", from_string=True) + + parents1 = { + "g1903": [], + "g1903.t1": ["g1903"], + "g1903.t1.d1": ["g1903.t1"], + "g1903.t1.d1.1": ["g1903.t1.d1"], + } + parents2 = { + "g1903": [], + "g1903.t1": [], + "g1903.t1.d1": ["g1903"], + "g1903.t1.d1.1": ["g1903.t1"], + } + parents3 = { + "g1903": [], + "g1903.t1": [], + "g1903.t1.d1": [], + "g1903.t1.d1.1": ["g1903"], + } + expected_levels = [parents1, parents2, parents3] + + for i, expected_level in enumerate(expected_levels): + for child, expected_parents in expected_level.items(): + level = i + 1 + observed_parents = [i.id for i in db.parents(child, level=level)] + print(f"observed level {str(level)} anscestors for {child}:", set(observed_parents)) + print(f"expected level {str(level)} anscestors for {child}:", set(expected_parents)) + assert set(observed_parents) == set(expected_parents) + + def test_issue_207(): def _check(txt, expected_keys, dialect_trailing_semicolon): db = gffutils.create_db(txt.replace(" ", "\t"), ":memory:", from_string=True)