From b3a6abc2f22fcbfe65f3b5d5ee9ffe192ea04b10 Mon Sep 17 00:00:00 2001 From: Soma Mbadiwe Date: Tue, 28 Jul 2020 00:41:28 -0400 Subject: [PATCH 1/2] Fixed Issue #271: Added support for functions taking a filepath --- allel/io/fasta.py | 2 +- allel/io/gff.py | 3 ++- allel/io/vcf_read.py | 6 +++++- allel/io/vcf_write.py | 3 +-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/allel/io/fasta.py b/allel/io/fasta.py index 80321aa7..6c6a904c 100644 --- a/allel/io/fasta.py +++ b/allel/io/fasta.py @@ -36,7 +36,7 @@ def write_fasta(path, sequences, names, mode='w', width=80): mode = 'ab' if 'a' in mode else 'wb' # write to file - with open(path, mode=mode) as fasta: + with open(str(path), mode=mode) as fasta: for name, sequence in zip(names, sequences): # force bytes if isinstance(name, str): diff --git a/allel/io/gff.py b/allel/io/gff.py index cda4493f..1d455b0b 100644 --- a/allel/io/gff.py +++ b/allel/io/gff.py @@ -31,7 +31,7 @@ def iter_gff3(path, attributes=None, region=None, score_fill=-1, Parameters ---------- - path : string + path : string or pathlib.Path Path to input file. attributes : list of strings, optional List of columns to extract from the "attributes" field. 
@@ -64,6 +64,7 @@ def iter_gff3(path, attributes=None, region=None, score_fill=-1, attributes_fill = [attributes_fill] * len(attributes) # open input stream + path = str(path) if region is not None: cmd = [tabix, path, region] buffer = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout diff --git a/allel/io/vcf_read.py b/allel/io/vcf_read.py index 9eae0261..1805f167 100644 --- a/allel/io/vcf_read.py +++ b/allel/io/vcf_read.py @@ -15,7 +15,7 @@ import subprocess import textwrap from collections import OrderedDict - +from pathlib import Path import numpy as np @@ -1012,6 +1012,10 @@ def _setup_input_stream(input, region=None, tabix=None, buffer_size=DEFAULT_BUFF # obtain a file-like object close = False + + if isinstance(input, Path): + input = str(input) + if isinstance(input, str) and input.endswith('gz'): if region and tabix and os.name != 'nt': diff --git a/allel/io/vcf_write.py b/allel/io/vcf_write.py index 7c7cef44..86651d18 100644 --- a/allel/io/vcf_write.py +++ b/allel/io/vcf_write.py @@ -5,7 +5,6 @@ from operator import itemgetter import logging - import numpy as np @@ -50,7 +49,7 @@ def write_vcf(path, callset, rename=None, number=None, description=None, names, callset = normalize_callset(callset) - with open(path, 'w') as vcf_file: + with open(str(path), 'w') as vcf_file: if write_header: write_vcf_header(vcf_file, names, callset=callset, rename=rename, number=number, description=description) From 4432362fc2dea5706ad358f6b4bab4186fb70a60 Mon Sep 17 00:00:00 2001 From: Soma Mbadiwe Date: Fri, 31 Jul 2020 02:02:45 -0400 Subject: [PATCH 2/2] Fixed Issue #271: Added support for functions taking a filepath. Changes to accept even more file options: IOBase, pathlib.Path, file path. 
--- allel/io/fasta.py | 12 ++++++++++-- allel/io/gff.py | 27 ++++++++++++++++----------- allel/io/vcf_read.py | 9 ++++----- allel/io/vcf_write.py | 12 +++++++++++- allel/util.py | 11 +++++++++++ 5 files changed, 52 insertions(+), 19 deletions(-) diff --git a/allel/io/fasta.py b/allel/io/fasta.py index 6c6a904c..6e3fbbc8 100644 --- a/allel/io/fasta.py +++ b/allel/io/fasta.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import numpy as np +from allel.util import resolve_path def write_fasta(path, sequences, names, mode='w', width=80): @@ -35,8 +36,7 @@ def write_fasta(path, sequences, names, mode='w', width=80): # force binary mode mode = 'ab' if 'a' in mode else 'wb' - # write to file - with open(str(path), mode=mode) as fasta: + def save_as_fasta(fasta): for name, sequence in zip(names, sequences): # force bytes if isinstance(name, str): @@ -46,3 +46,11 @@ def write_fasta(path, sequences, names, mode='w', width=80): for i in range(0, sequence.size, width): line = sequence[i:i+width].tostring() + b'\n' fasta.write(line) + + # write to file + path = resolve_path(path) + if hasattr(path, 'write'): + save_as_fasta(path) + else: + with open(path, mode=mode) as f: + save_as_fasta(f) diff --git a/allel/io/gff.py b/allel/io/gff.py index 1d455b0b..c8961e1d 100644 --- a/allel/io/gff.py +++ b/allel/io/gff.py @@ -2,6 +2,7 @@ import subprocess import gzip from urllib.parse import unquote_plus +from allel.util import resolve_path import numpy as np @@ -31,7 +32,7 @@ def iter_gff3(path, attributes=None, region=None, score_fill=-1, Parameters ---------- - path : string or pathlib.Path + path : string, pathlib.Path or any file-like object Path to input file. attributes : list of strings, optional List of columns to extract from the "attributes" field. 
@@ -64,15 +65,19 @@ def iter_gff3(path, attributes=None, region=None, score_fill=-1, attributes_fill = [attributes_fill] * len(attributes) # open input stream - path = str(path) - if region is not None: - cmd = [tabix, path, region] - buffer = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout - elif path.endswith('.gz') or path.endswith('.bgz'): - buffer = gzip.open(path, mode='rb') - else: - buffer = open(path, mode='rb') + # string paths are opened here; any other resolved object is read from directly + path = resolve_path(path) + if isinstance(path, str): + if region is not None: + cmd = [tabix, path, region] + buffer = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout + elif path.endswith('.gz') or path.endswith('.bgz'): + buffer = gzip.open(path, mode='rb') + else: + buffer = open(path, mode='rb') + else: + buffer = path try: for line in buffer: if line[0] == b'>': @@ -124,7 +129,7 @@ def gff3_to_recarray(path, attributes=None, region=None, score_fill=-1, Parameters ---------- - path : string + path : string, pathlib.Path or any file-like object Path to input file. attributes : list of strings, optional List of columns to extract from the "attributes" field. @@ -181,7 +186,7 @@ def gff3_to_dataframe(path, attributes=None, region=None, score_fill=-1, Parameters ---------- - path : string + path : string, pathlib.Path or any file-like object Path to input file. attributes : list of strings, optional List of columns to extract from the "attributes" field. 
diff --git a/allel/io/vcf_read.py b/allel/io/vcf_read.py index 1805f167..4677e7e5 100644 --- a/allel/io/vcf_read.py +++ b/allel/io/vcf_read.py @@ -15,8 +15,7 @@ import subprocess import textwrap from collections import OrderedDict -from pathlib import Path - +from allel.util import resolve_path import numpy as np @@ -434,8 +433,9 @@ def vcf_to_npz(input, output, """ + output = resolve_path(output) # guard condition - if not overwrite and os.path.exists(output): + if not overwrite and isinstance(output, str) and os.path.exists(output): raise ValueError('file exists at path %r; use overwrite=True to replace' % output) # read all data into memory @@ -1013,8 +1013,7 @@ def _setup_input_stream(input, region=None, tabix=None, buffer_size=DEFAULT_BUFF # obtain a file-like object close = False - if isinstance(input, Path): - input = str(input) + input = resolve_path(input) if isinstance(input, str) and input.endswith('gz'): diff --git a/allel/io/vcf_write.py b/allel/io/vcf_write.py index 86651d18..8cdf4c56 100644 --- a/allel/io/vcf_write.py +++ b/allel/io/vcf_write.py @@ -8,6 +8,7 @@ import numpy as np +from allel.util import resolve_path import allel @@ -49,12 +50,19 @@ def write_vcf(path, callset, rename=None, number=None, description=None, names, callset = normalize_callset(callset) - with open(str(path), 'w') as vcf_file: + def write_file(vcf_file): if write_header: write_vcf_header(vcf_file, names, callset=callset, rename=rename, number=number, description=description) write_vcf_data(vcf_file, names, callset=callset, rename=rename, fill=fill) + path = resolve_path(path) + if hasattr(path, 'write'): + write_file(path) + else: + with open(path, 'w') as f: + write_file(f) + def write_vcf_header(vcf_file, names, callset, rename, number, description): if rename is None: @@ -64,6 +72,7 @@ def write_vcf_header(vcf_file, names, callset, rename, number, description): if description is None: description = dict() + vcf_file = resolve_path(vcf_file) # write file format version 
print('##fileformat=VCFv4.1', file=vcf_file) @@ -152,6 +161,7 @@ def write_vcf_data(vcf_file, names, callset, rename, fill): if fill is None: fill = dict() + vcf_file = resolve_path(vcf_file) # find the fixed columns, allowing for case insensitive naming in the # input array col_chrom = None diff --git a/allel/util.py b/allel/util.py index fb85807e..7ca882d3 100644 --- a/allel/util.py +++ b/allel/util.py @@ -9,6 +9,17 @@ import numpy as np +def resolve_path(path): + """ + Return str, bytes and objects with a ``read`` attribute unchanged; pass anything else to os.fspath. + :param path: str, bytes, an object with a ``read`` attribute, or a path-like object + :return: ``path`` itself, or the result of ``os.fspath(path)`` + """ + if hasattr(path, 'read') or isinstance(path, (str, bytes)): + return path + return os.fspath(path) + + @contextmanager def ignore_invalid(): err = np.seterr(invalid='ignore')