daler · aliciaaevans · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024
diff --git a/gffutils/create.py b/gffutils/create.py
@@ -675,64 +675,43 @@ def _populate_from_lines(self, lines):
     def _update_relations(self):
         logger.info("Updating relations")
         c = self.conn.cursor()
-        c2 = self.conn.cursor()
-        c3 = self.conn.cursor()
 
         # TODO: pre-compute indexes?
         # c.execute('CREATE INDEX ids ON features (id)')
         # c.execute('CREATE INDEX parentindex ON relations (parent)')
         # c.execute('CREATE INDEX childindex ON relations (child)')
         # self.conn.commit()
 
-        if isinstance(self._keep_tempfiles, str):
-            suffix = self._keep_tempfiles
-        else:
-            suffix = ".gffutils"
-        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
-        with open(tmp, "w") as fout:
-
-            # Here we look for "grandchildren" -- for each ID, get the child
-            # (parenthetical subquery below); then for each of those get *its*
-            # child (main query below).
-            #
-            # Results are written to temp file so that we don't read and write at
-            # the same time, which would slow things down considerably.
-
-            c.execute("SELECT id FROM features")
-            for parent in c:
-                c2.execute(
-                    """
-                           SELECT child FROM relations WHERE parent IN
-                           (SELECT child FROM relations WHERE parent = ?)
-                           """,
-                    tuple(parent),
+        c.execute(
+                """
+                WITH RECURSIVE generation AS (
+                    SELECT parent,
+                        child,
+                        level
+                    FROM relations
+                    WHERE level = 1
+
+                UNION ALL
+
+                    SELECT g.parent,
+                        child.child,
+                        g.level+1 AS level
+                    FROM relations child
+                    JOIN generation g
+                    ON g.child = child.parent
                 )
-                for grandchild in c2:
-                    fout.write("\t".join((parent[0], grandchild[0])) + "\n")
-
-        def relations_generator():
-            with open(fout.name) as fin:
-                for line in fin:
-                    parent, child = line.strip().split("\t")
-                    yield dict(parent=parent, child=child, level=2)
-
-        c.executemany(
-            """
-            INSERT OR IGNORE INTO relations VALUES
-            (:parent, :child, :level)
-            """,
-            relations_generator(),
-        )
-
+                INSERT OR IGNORE INTO relations
+                SELECT * FROM generation
+                WHERE level > 1;
+                        """
+            )
+
         # TODO: Index creation.  Which ones affect performance?
         c.execute("DROP INDEX IF EXISTS binindex")
         c.execute("CREATE INDEX binindex ON features (bin)")
 
         self.conn.commit()
 
-        if not self._keep_tempfiles:
-            os.unlink(fout.name)
-
 
 class _GTFDBCreator(_DBCreator):
     def __init__(self, *args, **kwargs):

diff --git a/gffutils/test/test_1.py b/gffutils/test/test_1.py
@@ -961,15 +961,6 @@ def test_tempfiles():
     assert len(filelist) == 1, filelist
     assert filelist[0].endswith(".gffutils")
 
-    # ...and another one for gff. This time, make sure the suffix
-    db = gffutils.create_db(
-        gffutils.example_filename("FBgn0031208.gff"), ":memory:", _keep_tempfiles=True
-    )
-    filelist = os.listdir(tempdir)
-    assert len(filelist) == 2, filelist
-    for i in filelist:
-        assert i.endswith(".gffutils")
-
     # OK, now delete what we have so far...
     clean_tempdir()
 

diff --git a/gffutils/test/test_issues.py b/gffutils/test/test_issues.py
@@ -473,6 +473,45 @@ def test_issue_198():
     assert f.attributes["Parent"] == ["XM_001475631.1", ""]
 
 
+def test_issue_204():
+    txt = dedent("""\
+    chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
+    chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903;
+    chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1;
+    chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1;
+    """)
+
+    db = gffutils.create_db(txt.replace(" ", "\t"), ":memory:", from_string=True)
+
+    parents1 = {
+        "g1903": [],
+        "g1903.t1": ["g1903"],
+        "g1903.t1.d1": ["g1903.t1"],
+        "g1903.t1.d1.1": ["g1903.t1.d1"],
+    }
+    parents2 = {
+        "g1903": [],
+        "g1903.t1": [],
+        "g1903.t1.d1": ["g1903"],
+        "g1903.t1.d1.1": ["g1903.t1"],
+    }
+    parents3 = {
+        "g1903": [],
+        "g1903.t1": [],
+        "g1903.t1.d1": [],
+        "g1903.t1.d1.1": ["g1903"],
+    }
+    expected_levels = [parents1, parents2, parents3]
+
+    for i, expected_level in enumerate(expected_levels):
+        for child, expected_parents in expected_level.items():
+            level = i + 1
+            observed_parents = [i.id for i in db.parents(child, level=level)]
+            print(f"observed level {str(level)} anscestors for {child}:", set(observed_parents))
+            print(f"expected level {str(level)} anscestors for {child}:", set(expected_parents))
+            assert set(observed_parents) == set(expected_parents)
+
+
 def test_issue_207():
     def _check(txt, expected_keys, dialect_trailing_semicolon):
         db = gffutils.create_db(txt.replace(" ", "\t"), ":memory:", from_string=True)