Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 23 additions & 44 deletions gffutils/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,64 +675,43 @@ def _populate_from_lines(self, lines):
def _update_relations(self):
logger.info("Updating relations")
c = self.conn.cursor()
c2 = self.conn.cursor()
c3 = self.conn.cursor()

# TODO: pre-compute indexes?
# c.execute('CREATE INDEX ids ON features (id)')
# c.execute('CREATE INDEX parentindex ON relations (parent)')
# c.execute('CREATE INDEX childindex ON relations (child)')
# self.conn.commit()

if isinstance(self._keep_tempfiles, str):
suffix = self._keep_tempfiles
else:
suffix = ".gffutils"
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
with open(tmp, "w") as fout:

# Here we look for "grandchildren" -- for each ID, get the child
# (parenthetical subquery below); then for each of those get *its*
# child (main query below).
#
# Results are written to temp file so that we don't read and write at
# the same time, which would slow things down considerably.

c.execute("SELECT id FROM features")
for parent in c:
c2.execute(
"""
SELECT child FROM relations WHERE parent IN
(SELECT child FROM relations WHERE parent = ?)
""",
tuple(parent),
c.execute(
"""
WITH RECURSIVE generation AS (
SELECT parent,
child,
level
FROM relations
WHERE level = 1

UNION ALL

SELECT g.parent,
child.child,
g.level+1 AS level
FROM relations child
JOIN generation g
ON g.child = child.parent
)
for grandchild in c2:
fout.write("\t".join((parent[0], grandchild[0])) + "\n")

def relations_generator():
with open(fout.name) as fin:
for line in fin:
parent, child = line.strip().split("\t")
yield dict(parent=parent, child=child, level=2)

c.executemany(
"""
INSERT OR IGNORE INTO relations VALUES
(:parent, :child, :level)
""",
relations_generator(),
)

INSERT OR IGNORE INTO relations
SELECT * FROM generation
WHERE level > 1;
"""
)

# TODO: Index creation. Which ones affect performance?
c.execute("DROP INDEX IF EXISTS binindex")
c.execute("CREATE INDEX binindex ON features (bin)")

self.conn.commit()

if not self._keep_tempfiles:
os.unlink(fout.name)


class _GTFDBCreator(_DBCreator):
def __init__(self, *args, **kwargs):
Expand Down
9 changes: 0 additions & 9 deletions gffutils/test/test_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,15 +961,6 @@ def test_tempfiles():
assert len(filelist) == 1, filelist
assert filelist[0].endswith(".gffutils")

# ...and another one for gff. This time, make sure the suffix
db = gffutils.create_db(
gffutils.example_filename("FBgn0031208.gff"), ":memory:", _keep_tempfiles=True
)
filelist = os.listdir(tempdir)
assert len(filelist) == 2, filelist
for i in filelist:
assert i.endswith(".gffutils")

# OK, now delete what we have so far...
clean_tempdir()

Expand Down
39 changes: 39 additions & 0 deletions gffutils/test/test_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,45 @@ def test_issue_198():
assert f.attributes["Parent"] == ["XM_001475631.1", ""]


def test_issue_204():
txt = dedent("""\
chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903;
chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1;
chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1;
""")

db = gffutils.create_db(txt.replace(" ", "\t"), ":memory:", from_string=True)

parents1 = {
"g1903": [],
"g1903.t1": ["g1903"],
"g1903.t1.d1": ["g1903.t1"],
"g1903.t1.d1.1": ["g1903.t1.d1"],
}
parents2 = {
"g1903": [],
"g1903.t1": [],
"g1903.t1.d1": ["g1903"],
"g1903.t1.d1.1": ["g1903.t1"],
}
parents3 = {
"g1903": [],
"g1903.t1": [],
"g1903.t1.d1": [],
"g1903.t1.d1.1": ["g1903"],
}
expected_levels = [parents1, parents2, parents3]

for i, expected_level in enumerate(expected_levels):
for child, expected_parents in expected_level.items():
level = i + 1
observed_parents = [i.id for i in db.parents(child, level=level)]
print(f"observed level {str(level)} anscestors for {child}:", set(observed_parents))
print(f"expected level {str(level)} anscestors for {child}:", set(expected_parents))
assert set(observed_parents) == set(expected_parents)


def test_issue_207():
def _check(txt, expected_keys, dialect_trailing_semicolon):
db = gffutils.create_db(txt.replace(" ", "\t"), ":memory:", from_string=True)
Expand Down