From f093a774fb44f6f9ef9e44d05567c788f2563cc0 Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Wed, 26 Feb 2025 14:18:23 +0530 Subject: [PATCH 01/16] Improve chromsizes file validation to catch formatting errors early (#209) --- docs/releasenotes.md | 2 +- src/cooler/util.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/releasenotes.md b/docs/releasenotes.md index cf547089..43fe6b94 120000 --- a/docs/releasenotes.md +++ b/docs/releasenotes.md @@ -1 +1 @@ -../CHANGES.md \ No newline at end of file +../CHANGES.md diff --git a/src/cooler/util.py b/src/cooler/util.py index d25b65ed..375516b0 100644 --- a/src/cooler/util.py +++ b/src/cooler/util.py @@ -235,7 +235,6 @@ def read_chromsizes( ---------- * `UCSC assembly terminology `_ * `GRC assembly terminology `_ - """ if isinstance(filepath_or, str) and filepath_or.endswith(".gz"): kwargs.setdefault("compression", "gzip") @@ -247,6 +246,17 @@ def read_chromsizes( dtype={"name": str}, **kwargs, ) + + # Convert the "length" column to numeric values. + chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce") + if chromtable["length"].isnull().any(): + raise ValueError( + f"Chromsizes file '{filepath_or}' contains missing or invalid " + "length values. Please ensure that the file is properly formatted " + "as tab-delimited with two columns: sequence name and integer " + "length. Check for extraneous spaces or hidden characters." 
+ ) + if not all_names: parts = [] for pattern in name_patterns: From c8616a858fe1828f6205d41f75f3cd30012c6ffb Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Sat, 1 Mar 2025 07:06:29 +0530 Subject: [PATCH 02/16] Added a test function for the chromsize check introduced in src/cooler/util.py --- tests/test_chromsize_check.py | 72 +++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 tests/test_chromsize_check.py diff --git a/tests/test_chromsize_check.py b/tests/test_chromsize_check.py new file mode 100644 index 00000000..a3507db8 --- /dev/null +++ b/tests/test_chromsize_check.py @@ -0,0 +1,72 @@ +import io +from typing import TextIO # Import TextIO for type hinting + +import pandas as pd +import pytest +from numpy import argsort as argnatsort + + +# Updated read_chromsizes function with TextIO for Python 3.11+ +def read_chromsizes( + filepath_or: str | TextIO, # Use TextIO for file-like objects + name_patterns: tuple[str, ...] = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), + all_names: bool = False, + **kwargs, +) -> pd.Series: + """ + Parse a `.chrom.sizes or .chromInfo.txt file from the UCSC + database, where `db` is a genome assembly name. + """ + # Handle .gz files + if isinstance(filepath_or, str) and filepath_or.endswith(".gz"): + kwargs.setdefault("compression", "gzip") + + # Read the chromosome size file into a DataFrame + chromtable = pd.read_csv( + filepath_or, + sep="\t", + usecols=[0, 1], + names=["name", "length"], + dtype={"name": str}, + **kwargs, + ) + + # Convert the "length" column to numeric values, coercing errors to NaN + chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce") + + # Raise an error if there are any invalid (NaN) lengths + if chromtable["length"].isnull().any(): + raise ValueError( + f"Chromsizes file '{filepath_or}' contains missing or invalid " + "length values. 
Please ensure that the file is properly formatted " + "as tab-delimited with two columns: sequence name and integer " + "length. Check for extraneous spaces or hidden characters." + ) + + # Filter chromosomes by pattern and sort them + if not all_names: + parts = [] + for pattern in name_patterns: + part = chromtable[chromtable["name"].str.contains(pattern)] + part = part.iloc[argnatsort(part["name"])] + parts.append(part) + chromtable = pd.concat(parts, axis=0) + + # Set the chromosome names as the index + chromtable.index = chromtable["name"].values + return chromtable["length"] + + +# Test for the read_chromsizes function with invalid length value +def test_read_chromsizes_bad_input(): + broken_data = "chr1\t1000\nchr2\tbad_value\nchr3\t2000\n" + broken_file = io.StringIO(broken_data) + + # Expect a ValueError due to the non-numeric value in the "length" column. + with pytest.raises(ValueError, match="Chromsizes file '.*' invalid length values"): + read_chromsizes(broken_file) + + +# Main function to run the test +if __name__ == "__main__": + pytest.main([__file__]) From a7bc88321b54d27b9ada3be33f24abc3bea421c6 Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Sat, 1 Mar 2025 12:42:43 +0530 Subject: [PATCH 03/16] Fixed pytest error in test_chromsize_check.py and made minor tweaks to rea-chromsize in util.py --- src/cooler/util.py | 1731 +++++++++++++++++---------------- tests/test_chromsize_check.py | 77 +- 2 files changed, 890 insertions(+), 918 deletions(-) diff --git a/src/cooler/util.py b/src/cooler/util.py index 375516b0..b5fd08ad 100644 --- a/src/cooler/util.py +++ b/src/cooler/util.py @@ -1,860 +1,871 @@ -from __future__ import annotations - -import os -import re -from collections import OrderedDict, defaultdict -from collections.abc import Generator, Iterable, Iterator -from contextlib import contextmanager -from typing import IO, Any - -import h5py -import numpy as np -import pandas as pd -from pandas.api.types import is_integer, is_scalar - -from ._typing 
import GenomicRangeSpecifier, GenomicRangeTuple - - -def partition(start: int, stop: int, step: int) -> Iterator[tuple[int, int]]: - """Partition an integer interval into equally-sized subintervals. - Like builtin :py:func:`range`, but yields pairs of end points. - - Examples - -------- - >>> for lo, hi in partition(0, 9, 2): - print(lo, hi) - 0 2 - 2 4 - 4 6 - 6 8 - 8 9 - - """ - return ((i, min(i + step, stop)) for i in range(start, stop, step)) - - -def parse_cooler_uri(s: str) -> tuple[str, str]: - """ - Parse a Cooler URI string - - e.g. /path/to/mycoolers.cool::/path/to/cooler - - """ - parts = s.split("::") - if len(parts) == 1: - file_path, group_path = parts[0], "/" - elif len(parts) == 2: - file_path, group_path = parts - if not group_path.startswith("/"): - group_path = "/" + group_path - else: - raise ValueError("Invalid Cooler URI string") - return file_path, group_path - - -def atoi(s: str) -> int: - return int(s.replace(",", "")) - - -def parse_humanized(s: str) -> int: - _NUMERIC_RE = re.compile("([0-9,.]+)") - _, value, unit = _NUMERIC_RE.split(s.replace(",", "")) - if not len(unit): - return int(value) - - value = float(value) - unit = unit.upper().strip() - if unit in ("K", "KB"): - value *= 1000 - elif unit in ("M", "MB"): - value *= 1000000 - elif unit in ("G", "GB"): - value *= 1000000000 - else: - raise ValueError(f"Unknown unit '{unit}'") - return int(value) - - -def parse_region_string(s: str) -> tuple[str, int | None, int | None]: - """ - Parse a UCSC-style genomic region string into a triple. - - Parameters - ---------- - s : str - UCSC-style string, e.g. "chr5:10,100,000-30,000,000". Ensembl and FASTA - style sequence names are allowed. End coordinate must be greater than - or equal to start. 
- - Returns - ------- - (str, int or None, int or None) - - """ - - def _tokenize(s): - token_spec = [ - ("HYPHEN", r"-"), - ("COORD", r"[0-9,]+(\.[0-9]*)?(?:[a-z]+)?"), - ("OTHER", r".+"), - ] - pattern = r"|\s*".join([rf"(?P<{pair[0]}>{pair[1]})" for pair in token_spec]) - tok_regex = re.compile(rf"\s*{pattern}", re.IGNORECASE) - for match in tok_regex.finditer(s): - typ = match.lastgroup - yield typ, match.group(typ) - - def _check_token(typ, token, expected): - if typ is None: - raise ValueError("Expected {} token missing".format(" or ".join(expected))) - else: - if typ not in expected: - raise ValueError(f'Unexpected token "{token}"') - - def _expect(tokens): - typ, token = next(tokens, (None, None)) - _check_token(typ, token, ["COORD"]) - start = parse_humanized(token) - - typ, token = next(tokens, (None, None)) - _check_token(typ, token, ["HYPHEN"]) - - typ, token = next(tokens, (None, None)) - if typ is None: - return start, None - - _check_token(typ, token, ["COORD"]) - end = parse_humanized(token) - if end < start: - raise ValueError("End coordinate less than start") - - return start, end - - parts = s.split(":") - chrom = parts[0].strip() - if not len(chrom): - raise ValueError("Chromosome name cannot be empty") - if len(parts) < 2: - return (chrom, None, None) - start, end = _expect(_tokenize(parts[1])) - return (chrom, start, end) - - -def parse_region( - reg: GenomicRangeSpecifier, - chromsizes: dict | pd.Series | None = None -) -> GenomicRangeTuple: - """ - Genomic regions are represented as half-open intervals (0-based starts, - 1-based ends) along the length coordinate of a contig/scaffold/chromosome. - - Parameters - ---------- - reg : str or tuple - UCSC-style genomic region string, or - Triple (chrom, start, end), where ``start`` or ``end`` may be ``None``. - chromsizes : mapping, optional - Lookup table of scaffold lengths to check against ``chrom`` and the - ``end`` coordinate. Required if ``end`` is not supplied. 
- - Returns - ------- - A well-formed genomic region triple (str, int, int) - - """ - if isinstance(reg, str): - chrom, start, end = parse_region_string(reg) - else: - chrom, start, end = reg - start = int(start) if start is not None else start - end = int(end) if end is not None else end - - try: - clen = chromsizes[chrom] if chromsizes is not None else None - except KeyError as e: - raise ValueError(f"Unknown sequence label: {chrom}") from e - - start = 0 if start is None else start - if end is None: - if clen is None: # TODO --- remove? - raise ValueError("Cannot determine end coordinate.") - end = clen - - if end < start: - raise ValueError("End cannot be less than start") - - if start < 0 or (clen is not None and end > clen): - raise ValueError(f"Genomic region out of bounds: [{start}, {end})") - - return chrom, start, end - - -def natsort_key(s: str, _NS_REGEX=re.compile(r"(\d+)", re.U)) -> tuple: - return tuple([int(x) if x.isdigit() else x for x in _NS_REGEX.split(s) if x]) - - -def natsorted(iterable: Iterable[str]) -> list[str]: - return sorted(iterable, key=natsort_key) - - -def argnatsort(array: Iterable[str]) -> np.ndarray: - array = np.asarray(array) - if not len(array): - return np.array([], dtype=int) - cols = tuple(zip(*(natsort_key(x) for x in array))) - return np.lexsort(cols[::-1]) - - -def read_chromsizes( - filepath_or: str | IO[str], - name_patterns: tuple[str, ...] = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), - all_names: bool = False, - **kwargs, -) -> pd.Series: - """ - Parse a ``.chrom.sizes`` or ``.chromInfo.txt`` file from the UCSC - database, where ``db`` is a genome assembly name. - - Parameters - ---------- - filepath_or : str or file-like - Path or url to text file, or buffer. - name_patterns : sequence, optional - Sequence of regular expressions to capture desired sequence names. - Each corresponding set of records will be sorted in natural order. - all_names : bool, optional - Whether to return all contigs listed in the file. 
Default is - ``False``. - - Returns - ------- - :py:class:`pandas.Series` - Series of integer bp lengths indexed by sequence name. - - References - ---------- - * `UCSC assembly terminology `_ - * `GRC assembly terminology `_ - """ - if isinstance(filepath_or, str) and filepath_or.endswith(".gz"): - kwargs.setdefault("compression", "gzip") - chromtable = pd.read_csv( - filepath_or, - sep="\t", - usecols=[0, 1], - names=["name", "length"], - dtype={"name": str}, - **kwargs, - ) - - # Convert the "length" column to numeric values. - chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce") - if chromtable["length"].isnull().any(): - raise ValueError( - f"Chromsizes file '{filepath_or}' contains missing or invalid " - "length values. Please ensure that the file is properly formatted " - "as tab-delimited with two columns: sequence name and integer " - "length. Check for extraneous spaces or hidden characters." - ) - - if not all_names: - parts = [] - for pattern in name_patterns: - part = chromtable[chromtable["name"].str.contains(pattern)] - part = part.iloc[argnatsort(part["name"])] - parts.append(part) - chromtable = pd.concat(parts, axis=0) - chromtable.index = chromtable["name"].values - return chromtable["length"] - - -def fetch_chromsizes(db: str, **kwargs) -> pd.Series: - """ - Download chromosome sizes from UCSC as a :py:class:`pandas.Series`, indexed - by chromosome label. - - """ - return read_chromsizes( - f"http://hgdownload.soe.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz", - **kwargs, - ) - - -def load_fasta(names: list[str], *filepaths: str) -> OrderedDict[str, Any]: - """ - Load lazy FASTA records from one or multiple files without reading them - into memory. - - Parameters - ---------- - names : sequence of str - Names of sequence records in FASTA file or files. - filepaths : str - Paths to one or more FASTA files to gather records from. 
- - Returns - ------- - OrderedDict of sequence name -> sequence record - - """ - import pyfaidx - - if len(filepaths) == 0: - raise ValueError("Need at least one file") - - if len(filepaths) == 1: - fa = pyfaidx.Fasta(filepaths[0], as_raw=True) - - else: - fa = {} - for filepath in filepaths: - fa.update(pyfaidx.Fasta(filepath, as_raw=True).records) - - records = OrderedDict((chrom, fa[chrom]) for chrom in names) - return records - - -def binnify(chromsizes: pd.Series, binsize: int) -> pd.DataFrame: - """ - Divide a genome into evenly sized bins. - - Parameters - ---------- - chromsizes : Series - pandas Series indexed by chromosome name with chromosome lengths in bp. - binsize : int - size of bins in bp - - Returns - ------- - bins : :py:class:`pandas.DataFrame` - Dataframe with columns: ``chrom``, ``start``, ``end``. - - """ - - def _each(chrom): - clen = chromsizes[chrom] - n_bins = int(np.ceil(clen / binsize)) - binedges = np.arange(0, (n_bins + 1)) * binsize - binedges[-1] = clen - return pd.DataFrame( - {"chrom": [chrom] * n_bins, "start": binedges[:-1], "end": binedges[1:]}, - columns=["chrom", "start", "end"], - ) - - bintable = pd.concat(map(_each, chromsizes.keys()), axis=0, ignore_index=True) - - bintable["chrom"] = pd.Categorical( - bintable["chrom"], categories=list(chromsizes.index), ordered=True - ) - - return bintable - - -make_bintable = binnify - - -def digest(fasta_records: OrderedDict[str, Any], enzyme: str) -> pd.DataFrame: - """ - Divide a genome into restriction fragments. - - Parameters - ---------- - fasta_records : OrderedDict - Dictionary of chromosome names to sequence records. - enzyme: str - Name of restriction enzyme (e.g., 'DpnII'). - - Returns - ------- - frags : :py:class:`pandas.DataFrame` - Dataframe with columns: ``chrom``, ``start``, ``end``. - - """ - try: - import Bio.Restriction as biorst - import Bio.Seq as bioseq - except ImportError: - raise ImportError( - "Biopython is required to find restriction fragments." 
- ) from None - - # http://biopython.org/DIST/docs/cookbook/Restriction.html#mozTocId447698 - chroms = fasta_records.keys() - try: - cut_finder = getattr(biorst, enzyme).search - except AttributeError as e: - raise ValueError(f"Unknown enzyme name: {enzyme}") from e - - def _each(chrom): - seq = bioseq.Seq(str(fasta_records[chrom][:])) - cuts = np.r_[0, np.array(cut_finder(seq)) + 1, len(seq)].astype(np.int64) - n_frags = len(cuts) - 1 - - frags = pd.DataFrame( - {"chrom": [chrom] * n_frags, "start": cuts[:-1], "end": cuts[1:]}, - columns=["chrom", "start", "end"], - ) - return frags - - return pd.concat(map(_each, chroms), axis=0, ignore_index=True) - - -def get_binsize(bins: pd.DataFrame) -> int | None: - """ - Infer bin size from a bin DataFrame. Assumes that the last bin of each - contig is allowed to differ in size from the rest. - - Returns - ------- - int or None if bins are non-uniform - - """ - sizes = set() - for _chrom, group in bins.groupby("chrom", observed=True): - sizes.update((group["end"] - group["start"]).iloc[:-1].unique()) - if len(sizes) > 1: - return None - if len(sizes) == 1: - return next(iter(sizes)) - else: - return None - - -def get_chromsizes(bins: pd.DataFrame) -> pd.Series: - """ - Infer chromsizes Series from a bin DataFrame. Assumes that the last bin of - each contig is allowed to differ in size from the rest. - - Returns - ------- - int or None if bins are non-uniform - - """ - chromtable = ( - bins.drop_duplicates(["chrom"], keep="last")[["chrom", "end"]] - .reset_index(drop=True) - .rename(columns={"chrom": "name", "end": "length"}) - ) - chroms, lengths = list(chromtable["name"]), list(chromtable["length"]) - return pd.Series(index=chroms, data=lengths) - - -def bedslice( - grouped, - chromsizes: pd.Series | dict, - region: GenomicRangeSpecifier, -) -> pd.DataFrame: - """ - Range query on a BED-like dataframe with non-overlapping intervals. 
- - """ - chrom, start, end = parse_region(region, chromsizes) - result = grouped.get_group(chrom) - if start > 0 or end < chromsizes[chrom]: - lo = result["end"].values.searchsorted(start, side="right") - hi = lo + result["start"].values[lo:].searchsorted(end, side="left") - result = result.iloc[lo:hi] - return result - - -def asarray_or_dataset(x: Any) -> np.ndarray | h5py.Dataset: - return x if isinstance(x, h5py.Dataset) else np.asarray(x) - - -def rlencode( - array: np.ndarray, - chunksize: int | None = None -) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Run length encoding. - Based on http://stackoverflow.com/a/32681075, which is based on the rle - function from R. - - Parameters - ---------- - x : 1D array_like - Input array to encode - dropna: bool, optional - Drop all runs of NaNs. - - Returns - ------- - start positions, run lengths, run values - - """ - where = np.flatnonzero - array = asarray_or_dataset(array) - n = len(array) - if n == 0: - return ( - np.array([], dtype=int), - np.array([], dtype=int), - np.array([], dtype=array.dtype), - ) - - if chunksize is None: - chunksize = n - - starts, values = [], [] - last_val = np.nan - for i in range(0, n, chunksize): - x = array[i : i + chunksize] - locs = where(x[1:] != x[:-1]) + 1 - if x[0] != last_val: - locs = np.r_[0, locs] - starts.append(i + locs) - values.append(x[locs]) - last_val = x[-1] - starts = np.concatenate(starts) - lengths = np.diff(np.r_[starts, n]) - values = np.concatenate(values) - - return starts, lengths, values - - -def cmd_exists(cmd: str) -> bool: - return any( - os.access(os.path.join(path, cmd), os.X_OK) - for path in os.environ["PATH"].split(os.pathsep) - ) - - -def mad(data: np.ndarray, axis: int | None = None) -> np.ndarray: - return np.median(np.abs(data - np.median(data, axis)), axis) - - -@contextmanager -def open_hdf5( - fp: str | h5py.Group, - mode: str = "r", - *args, - **kwargs -) -> Generator[h5py.Group, None, None]: - """ - Context manager like 
``h5py.File`` but accepts already open HDF5 file - handles which do not get closed on teardown. - - Parameters - ---------- - fp : str or ``h5py.File`` object - If an open file object is provided, it passes through unchanged, - provided that the requested mode is compatible. - If a filepath is passed, the context manager will close the file on - tear down. - - mode : str - * r Readonly, file must exist - * r+ Read/write, file must exist - * a Read/write if exists, create otherwise - * w Truncate if exists, create otherwise - * w- or x Fail if exists, create otherwise - - """ - if isinstance(fp, str): - own_fh = True - fh = h5py.File(fp, mode, *args, **kwargs) - else: - own_fh = False - if mode == "r" and fp.file.mode == "r+": - # warnings.warn("File object provided is writeable but intent is read-only") - pass - elif mode in ("r+", "a") and fp.file.mode == "r": - raise ValueError("File object provided is not writeable") - elif mode == "w": - raise ValueError("Cannot truncate open file") - elif mode in ("w-", "x"): - raise ValueError("File exists") - fh = fp - try: - yield fh - finally: - if own_fh: - fh.close() - - -class closing_hdf5(h5py.Group): - def __init__(self, grp: h5py.Group): - super().__init__(grp.id) - - def __enter__(self) -> h5py.Group: - return self - - def __exit__(self, *exc_info) -> None: - return self.file.close() - - def close(self) -> None: - self.file.close() - - -def attrs_to_jsonable(attrs: h5py.AttributeManager) -> dict: - out = dict(attrs) - for k, v in attrs.items(): - try: - out[k] = v.item() - except ValueError: - out[k] = v.tolist() - except AttributeError: - out[k] = v - return out - - -def infer_meta(x, index=None): # pragma: no cover - """ - Extracted and modified from dask/dataframe/utils.py : - make_meta (BSD licensed) - - Create an empty pandas object containing the desired metadata. 
- - Parameters - ---------- - x : dict, tuple, list, pd.Series, pd.DataFrame, pd.Index, dtype, scalar - To create a DataFrame, provide a `dict` mapping of `{name: dtype}`, or - an iterable of `(name, dtype)` tuples. To create a `Series`, provide a - tuple of `(name, dtype)`. If a pandas object, names, dtypes, and index - should match the desired output. If a dtype or scalar, a scalar of the - same dtype is returned. - index : pd.Index, optional - Any pandas index to use in the metadata. If none provided, a - `RangeIndex` will be used. - - Examples - -------- - >>> make_meta([('a', 'i8'), ('b', 'O')]) - Empty DataFrame - Columns: [a, b] - Index: [] - >>> make_meta(('a', 'f8')) - Series([], Name: a, dtype: float64) - >>> make_meta('i8') - 1 - - """ - - _simple_fake_mapping = { - "b": np.bool_(True), - "V": np.void(b" "), - "M": np.datetime64("1970-01-01"), - "m": np.timedelta64(1), - "S": np.str_("foo"), - "a": np.str_("foo"), - "U": np.str_("foo"), - "O": "foo", - } - - UNKNOWN_CATEGORIES = "__UNKNOWN_CATEGORIES__" - - def _scalar_from_dtype(dtype): - if dtype.kind in ("i", "f", "u"): - return dtype.type(1) - elif dtype.kind == "c": - return dtype.type(complex(1, 0)) - elif dtype.kind in _simple_fake_mapping: - o = _simple_fake_mapping[dtype.kind] - return o.astype(dtype) if dtype.kind in ("m", "M") else o - else: - raise TypeError(f"Can't handle dtype: {dtype}") - - def _nonempty_scalar(x): - if isinstance(x, (pd.Timestamp, pd.Timedelta, pd.Period)): - return x - elif np.isscalar(x): - dtype = x.dtype if hasattr(x, "dtype") else np.dtype(type(x)) - return _scalar_from_dtype(dtype) - else: - raise TypeError("Can't handle meta of type " f"'{type(x).__name__}'") - - def _empty_series(name, dtype, index=None): - if isinstance(dtype, str) and dtype == "category": - return pd.Series( - pd.Categorical([UNKNOWN_CATEGORIES]), name=name, index=index - ).iloc[:0] - return pd.Series([], dtype=dtype, name=name, index=index) - - if hasattr(x, "_meta"): - return x._meta - if 
isinstance(x, (pd.Series, pd.DataFrame)): - return x.iloc[0:0] - elif isinstance(x, pd.Index): - return x[0:0] - index = index if index is None else index[0:0] - - if isinstance(x, dict): - return pd.DataFrame( - {c: _empty_series(c, d, index=index) for (c, d) in x.items()}, index=index - ) - if isinstance(x, tuple) and len(x) == 2: - return _empty_series(x[0], x[1], index=index) - elif isinstance(x, (list, tuple)): - if not all(isinstance(i, tuple) and len(i) == 2 for i in x): - raise ValueError( - "Expected iterable of tuples of (name, dtype), " f"got {x}" - ) - return pd.DataFrame( - {c: _empty_series(c, d, index=index) for (c, d) in x}, - columns=[c for c, d in x], - index=index, - ) - elif not hasattr(x, "dtype") and x is not None: - # could be a string, a dtype object, or a python type. Skip `None`, - # because it is implictly converted to `dtype('f8')`, which we don't - # want here. - try: - dtype = np.dtype(x) - return _scalar_from_dtype(dtype) - except: # noqa - # Continue on to next check - pass - - if is_scalar(x): - return _nonempty_scalar(x) - - raise TypeError(f"Don't know how to create metadata from {x}") - - -def get_meta( - columns, dtype=None, index_columns=None, index_names=None, default_dtype=np.object_ -): # pragma: no cover - """ - Extracted and modified from pandas/io/parsers.py : - _get_empty_meta (BSD licensed). - - """ - columns = list(columns) - - # Convert `dtype` to a defaultdict of some kind. - # This will enable us to write `dtype[col_name]` - # without worrying about KeyError issues later on. - if not isinstance(dtype, dict): - # if dtype == None, default will be default_dtype. - dtype = defaultdict(lambda: dtype or default_dtype) - else: - # Save a copy of the dictionary. - _dtype = dtype.copy() - dtype = defaultdict(lambda: default_dtype) - - # Convert column indexes to column names. 
- for k, v in _dtype.items(): - col = columns[k] if is_integer(k) else k - dtype[col] = v - - if index_columns is None or index_columns is False: - index = pd.Index([]) - else: - data = [pd.Series([], dtype=dtype[name]) for name in index_names] - if len(data) == 1: - index = pd.Index(data[0], name=index_names[0]) - else: - index = pd.MultiIndex.from_arrays(data, names=index_names) - index_columns.sort() - for i, n in enumerate(index_columns): - columns.pop(n - i) - - col_dict = {col_name: pd.Series([], dtype=dtype[col_name]) for col_name in columns} - - return pd.DataFrame(col_dict, columns=columns, index=index) - - -def check_bins(bins: pd.DataFrame, chromsizes: pd.Series) -> pd.DataFrame: - is_cat = isinstance(bins["chrom"].dtype, pd.CategoricalDtype) - bins = bins.copy() - if not is_cat: - bins["chrom"] = pd.Categorical( - bins.chrom, categories=list(chromsizes.index), ordered=True - ) - else: - assert (bins["chrom"].cat.categories == chromsizes.index).all() - - return bins - - -def balanced_partition( - gs: GenomeSegmentation, - n_chunk_max: int, - file_contigs: list[str], - loadings: list[int | float] | None = None -) -> list[GenomicRangeTuple]: - # n_bins = len(gs.bins) - grouped = gs._bins_grouped - - chrom_nbins = grouped.size() - if loadings is None: - loadings = chrom_nbins - chrmax = loadings.idxmax() - loadings = loadings / loadings.loc[chrmax] - const = chrom_nbins.loc[chrmax] / n_chunk_max - - granges = [] - for chrom, group in grouped: - if chrom not in file_contigs: - continue - clen = gs.chromsizes[chrom] - step = int(np.ceil(const / loadings.loc[chrom])) - anchors = group.start.values[::step] - if anchors[-1] != clen: - anchors = np.r_[anchors, clen] - granges.extend( - (chrom, start, end) for start, end in zip(anchors[:-1], anchors[1:]) - ) - return granges - - -class GenomeSegmentation: - def __init__(self, chromsizes: pd.Series, bins: pd.DataFrame): - bins = check_bins(bins, chromsizes) - self._bins_grouped = bins.groupby("chrom", 
observed=True, sort=False) - nbins_per_chrom = self._bins_grouped.size().values - - self.chromsizes = chromsizes - self.binsize = get_binsize(bins) - self.contigs = list(chromsizes.keys()) - self.bins = bins - self.idmap = pd.Series(index=chromsizes.keys(), data=range(len(chromsizes))) - self.chrom_binoffset = np.r_[0, np.cumsum(nbins_per_chrom)] - self.chrom_abspos = np.r_[0, np.cumsum(chromsizes.values)] - self.start_abspos = ( - self.chrom_abspos[bins["chrom"].cat.codes] + bins["start"].values - ) - - def fetch(self, region: GenomicRangeSpecifier) -> pd.DataFrame: - chrom, start, end = parse_region(region, self.chromsizes) - result = self._bins_grouped.get_group(chrom) - if start > 0 or end < self.chromsizes[chrom]: - lo = result["end"].values.searchsorted(start, side="right") - hi = lo + result["start"].values[lo:].searchsorted(end, side="left") - result = result.iloc[lo:hi] - return result - - -def buffered( - chunks: Iterable[pd.DataFrame], - size: int = 10000000 -) -> Iterator[pd.DataFrame]: - """ - Take an incoming iterator of small data frame chunks and buffer them into - an outgoing iterator of larger chunks. - - Parameters - ---------- - chunks : iterator of :py:class:`pandas.DataFrame` - Each chunk should have the same column names. - size : int - Minimum length of output chunks. - - Yields - ------ - Larger outgoing :py:class:`pandas.DataFrame` chunks made from concatenating - the incoming ones. 
- - """ - buf = [] - n = 0 - for chunk in chunks: - n += len(chunk) - buf.append(chunk) - if n > size: - yield pd.concat(buf, axis=0) - buf = [] - n = 0 - if len(buf): - yield pd.concat(buf, axis=0) +from __future__ import annotations + +import io +import os +import re +from collections import OrderedDict, defaultdict +from collections.abc import Generator, Iterable, Iterator +from contextlib import contextmanager +from typing import Any + +import h5py +import numpy as np +import pandas as pd +from pandas.api.types import is_integer, is_scalar + +from ._typing import GenomicRangeSpecifier, GenomicRangeTuple + + +def partition(start: int, stop: int, step: int) -> Iterator[tuple[int, int]]: + """Partition an integer interval into equally-sized subintervals. + Like builtin :py:func:`range`, but yields pairs of end points. + + Examples + -------- + >>> for lo, hi in partition(0, 9, 2): + print(lo, hi) + 0 2 + 2 4 + 4 6 + 6 8 + 8 9 + + """ + return ((i, min(i + step, stop)) for i in range(start, stop, step)) + + +def parse_cooler_uri(s: str) -> tuple[str, str]: + """ + Parse a Cooler URI string + + e.g. 
/path/to/mycoolers.cool::/path/to/cooler + + """ + parts = s.split("::") + if len(parts) == 1: + file_path, group_path = parts[0], "/" + elif len(parts) == 2: + file_path, group_path = parts + if not group_path.startswith("/"): + group_path = "/" + group_path + else: + raise ValueError("Invalid Cooler URI string") + return file_path, group_path + + +def atoi(s: str) -> int: + return int(s.replace(",", "")) + + +def parse_humanized(s: str) -> int: + _NUMERIC_RE = re.compile("([0-9,.]+)") + _, value, unit = _NUMERIC_RE.split(s.replace(",", "")) + if not len(unit): + return int(value) + + value = float(value) + unit = unit.upper().strip() + if unit in ("K", "KB"): + value *= 1000 + elif unit in ("M", "MB"): + value *= 1000000 + elif unit in ("G", "GB"): + value *= 1000000000 + else: + raise ValueError(f"Unknown unit '{unit}'") + return int(value) + + +def parse_region_string(s: str) -> tuple[str, int | None, int | None]: + """ + Parse a UCSC-style genomic region string into a triple. + + Parameters + ---------- + s : str + UCSC-style string, e.g. "chr5:10,100,000-30,000,000". Ensembl and FASTA + style sequence names are allowed. End coordinate must be greater than + or equal to start. 
+ + Returns + ------- + (str, int or None, int or None) + + """ + + def _tokenize(s): + token_spec = [ + ("HYPHEN", r"-"), + ("COORD", r"[0-9,]+(\.[0-9]*)?(?:[a-z]+)?"), + ("OTHER", r".+"), + ] + pattern = r"|\s*".join([rf"(?P<{pair[0]}>{pair[1]})" for pair in token_spec]) + tok_regex = re.compile(rf"\s*{pattern}", re.IGNORECASE) + for match in tok_regex.finditer(s): + typ = match.lastgroup + yield typ, match.group(typ) + + def _check_token(typ, token, expected): + if typ is None: + raise ValueError("Expected {} token missing".format(" or ".join(expected))) + else: + if typ not in expected: + raise ValueError(f'Unexpected token "{token}"') + + def _expect(tokens): + typ, token = next(tokens, (None, None)) + _check_token(typ, token, ["COORD"]) + start = parse_humanized(token) + + typ, token = next(tokens, (None, None)) + _check_token(typ, token, ["HYPHEN"]) + + typ, token = next(tokens, (None, None)) + if typ is None: + return start, None + + _check_token(typ, token, ["COORD"]) + end = parse_humanized(token) + if end < start: + raise ValueError("End coordinate less than start") + + return start, end + + parts = s.split(":") + chrom = parts[0].strip() + if not len(chrom): + raise ValueError("Chromosome name cannot be empty") + if len(parts) < 2: + return (chrom, None, None) + start, end = _expect(_tokenize(parts[1])) + return (chrom, start, end) + + +def parse_region( + reg: GenomicRangeSpecifier, + chromsizes: dict | pd.Series | None = None +) -> GenomicRangeTuple: + """ + Genomic regions are represented as half-open intervals (0-based starts, + 1-based ends) along the length coordinate of a contig/scaffold/chromosome. + + Parameters + ---------- + reg : str or tuple + UCSC-style genomic region string, or + Triple (chrom, start, end), where ``start`` or ``end`` may be ``None``. + chromsizes : mapping, optional + Lookup table of scaffold lengths to check against ``chrom`` and the + ``end`` coordinate. Required if ``end`` is not supplied. 
+ + Returns + ------- + A well-formed genomic region triple (str, int, int) + + """ + if isinstance(reg, str): + chrom, start, end = parse_region_string(reg) + else: + chrom, start, end = reg + start = int(start) if start is not None else start + end = int(end) if end is not None else end + + try: + clen = chromsizes[chrom] if chromsizes is not None else None + except KeyError as e: + raise ValueError(f"Unknown sequence label: {chrom}") from e + + start = 0 if start is None else start + if end is None: + if clen is None: # TODO --- remove? + raise ValueError("Cannot determine end coordinate.") + end = clen + + if end < start: + raise ValueError("End cannot be less than start") + + if start < 0 or (clen is not None and end > clen): + raise ValueError(f"Genomic region out of bounds: [{start}, {end})") + + return chrom, start, end + + +def natsort_key(s: str, _NS_REGEX=re.compile(r"(\d+)", re.U)) -> tuple: + return tuple([int(x) if x.isdigit() else x for x in _NS_REGEX.split(s) if x]) + + +def natsorted(iterable: Iterable[str]) -> list[str]: + return sorted(iterable, key=natsort_key) + + +def argnatsort(array: Iterable[str]) -> np.ndarray: + array = np.asarray(array) + if not len(array): + return np.array([], dtype=int) + cols = tuple(zip(*(natsort_key(x) for x in array))) + return np.lexsort(cols[::-1]) + +def read_chromsizes( + filepath_or: str | io.StringIO, + name_patterns: tuple[str, ...] = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), + all_names: bool = False, + verbose: bool = False, # Optional parameter to enable verbose output + **kwargs, +) -> pd.Series: + """ + Parse a `.chrom.sizes` or `.chromInfo.txt` file from the UCSC + database, where `db` is a genome assembly name. + + Parameters + ---------- + filepath_or : str or file-like + Path or url to text file, or buffer. + name_patterns : sequence, optional + Sequence of regular expressions to capture desired sequence names. + all_names : bool, optional + Whether to return all contigs listed in the file. 
+ verbose : bool, optional + Whether to enable verbose logging for diagnostics. + """ + # Check if the input is a file-like object (StringIO or file path) and inspect the first line for delimiters + if isinstance(filepath_or, (str, io.StringIO)): + first_line = None + if isinstance(filepath_or, io.StringIO): + first_line = filepath_or.getvalue().splitlines()[0] + elif isinstance(filepath_or, str): + with open(filepath_or) as file: + first_line = file.readline() + + if first_line and ' ' in first_line: + raise ValueError(f"Chromsizes file '{filepath_or}' uses spaces instead of tabs as delimiters. Please use tabs.") + + # Read the chromosome size file into a DataFrame + if verbose: + print(f"Reading chromsizes file: {filepath_or}") + + chromtable = pd.read_csv( + filepath_or, + sep="\t", # Ensuring tab is the delimiter + usecols=[0, 1], + names=["name", "length"], + dtype={"name": str}, + **kwargs, + ) + + + # Convert the "length" column to numeric values, coercing errors to NaN + chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce") + + # Check for NaN values after reading the file + if chromtable["length"].isnull().any(): + invalid_rows = chromtable[chromtable["length"].isnull()] + if verbose: + print(f"Invalid rows detected: {invalid_rows}") + raise ValueError( + f"Chromsizes file '{filepath_or}' contains missing or invalid length values. " + "Please ensure that the file is properly formatted as tab-delimited with two columns: sequence name and integer length. " + "Check for extraneous spaces or hidden characters. 
Invalid rows: \n{invalid_rows}" + ) + + # Filter by patterns if needed + if not all_names: + parts = [] + for pattern in name_patterns: + part = chromtable[chromtable["name"].str.contains(pattern)] + part = part.iloc[argnatsort(part["name"])] + parts.append(part) + chromtable = pd.concat(parts, axis=0) + + chromtable.index = chromtable["name"].values + return chromtable["length"] + +def fetch_chromsizes(db: str, **kwargs) -> pd.Series: + """ + Download chromosome sizes from UCSC as a :py:class:`pandas.Series`, indexed + by chromosome label. + + """ + return read_chromsizes( + f"http://hgdownload.soe.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz", + **kwargs, + ) + + +def load_fasta(names: list[str], *filepaths: str) -> OrderedDict[str, Any]: + """ + Load lazy FASTA records from one or multiple files without reading them + into memory. + + Parameters + ---------- + names : sequence of str + Names of sequence records in FASTA file or files. + filepaths : str + Paths to one or more FASTA files to gather records from. + + Returns + ------- + OrderedDict of sequence name -> sequence record + + """ + import pyfaidx + + if len(filepaths) == 0: + raise ValueError("Need at least one file") + + if len(filepaths) == 1: + fa = pyfaidx.Fasta(filepaths[0], as_raw=True) + + else: + fa = {} + for filepath in filepaths: + fa.update(pyfaidx.Fasta(filepath, as_raw=True).records) + + records = OrderedDict((chrom, fa[chrom]) for chrom in names) + return records + + +def binnify(chromsizes: pd.Series, binsize: int) -> pd.DataFrame: + """ + Divide a genome into evenly sized bins. + + Parameters + ---------- + chromsizes : Series + pandas Series indexed by chromosome name with chromosome lengths in bp. + binsize : int + size of bins in bp + + Returns + ------- + bins : :py:class:`pandas.DataFrame` + Dataframe with columns: ``chrom``, ``start``, ``end``. 
+ + """ + + def _each(chrom): + clen = chromsizes[chrom] + n_bins = int(np.ceil(clen / binsize)) + binedges = np.arange(0, (n_bins + 1)) * binsize + binedges[-1] = clen + return pd.DataFrame( + {"chrom": [chrom] * n_bins, "start": binedges[:-1], "end": binedges[1:]}, + columns=["chrom", "start", "end"], + ) + + bintable = pd.concat(map(_each, chromsizes.keys()), axis=0, ignore_index=True) + + bintable["chrom"] = pd.Categorical( + bintable["chrom"], categories=list(chromsizes.index), ordered=True + ) + + return bintable + + +make_bintable = binnify + + +def digest(fasta_records: OrderedDict[str, Any], enzyme: str) -> pd.DataFrame: + """ + Divide a genome into restriction fragments. + + Parameters + ---------- + fasta_records : OrderedDict + Dictionary of chromosome names to sequence records. + enzyme: str + Name of restriction enzyme (e.g., 'DpnII'). + + Returns + ------- + frags : :py:class:`pandas.DataFrame` + Dataframe with columns: ``chrom``, ``start``, ``end``. + + """ + try: + import Bio.Restriction as biorst + import Bio.Seq as bioseq + except ImportError: + raise ImportError( + "Biopython is required to find restriction fragments." + ) from None + + # http://biopython.org/DIST/docs/cookbook/Restriction.html#mozTocId447698 + chroms = fasta_records.keys() + try: + cut_finder = getattr(biorst, enzyme).search + except AttributeError as e: + raise ValueError(f"Unknown enzyme name: {enzyme}") from e + + def _each(chrom): + seq = bioseq.Seq(str(fasta_records[chrom][:])) + cuts = np.r_[0, np.array(cut_finder(seq)) + 1, len(seq)].astype(np.int64) + n_frags = len(cuts) - 1 + + frags = pd.DataFrame( + {"chrom": [chrom] * n_frags, "start": cuts[:-1], "end": cuts[1:]}, + columns=["chrom", "start", "end"], + ) + return frags + + return pd.concat(map(_each, chroms), axis=0, ignore_index=True) + + +def get_binsize(bins: pd.DataFrame) -> int | None: + """ + Infer bin size from a bin DataFrame. 
Assumes that the last bin of each + contig is allowed to differ in size from the rest. + + Returns + ------- + int or None if bins are non-uniform + + """ + sizes = set() + for _chrom, group in bins.groupby("chrom", observed=True): + sizes.update((group["end"] - group["start"]).iloc[:-1].unique()) + if len(sizes) > 1: + return None + if len(sizes) == 1: + return next(iter(sizes)) + else: + return None + + +def get_chromsizes(bins: pd.DataFrame) -> pd.Series: + """ + Infer chromsizes Series from a bin DataFrame. Assumes that the last bin of + each contig is allowed to differ in size from the rest. + + Returns + ------- + int or None if bins are non-uniform + + """ + chromtable = ( + bins.drop_duplicates(["chrom"], keep="last")[["chrom", "end"]] + .reset_index(drop=True) + .rename(columns={"chrom": "name", "end": "length"}) + ) + chroms, lengths = list(chromtable["name"]), list(chromtable["length"]) + return pd.Series(index=chroms, data=lengths) + + +def bedslice( + grouped, + chromsizes: pd.Series | dict, + region: GenomicRangeSpecifier, +) -> pd.DataFrame: + """ + Range query on a BED-like dataframe with non-overlapping intervals. + + """ + chrom, start, end = parse_region(region, chromsizes) + result = grouped.get_group(chrom) + if start > 0 or end < chromsizes[chrom]: + lo = result["end"].values.searchsorted(start, side="right") + hi = lo + result["start"].values[lo:].searchsorted(end, side="left") + result = result.iloc[lo:hi] + return result + + +def asarray_or_dataset(x: Any) -> np.ndarray | h5py.Dataset: + return x if isinstance(x, h5py.Dataset) else np.asarray(x) + + +def rlencode( + array: np.ndarray, + chunksize: int | None = None +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Run length encoding. + Based on http://stackoverflow.com/a/32681075, which is based on the rle + function from R. + + Parameters + ---------- + x : 1D array_like + Input array to encode + dropna: bool, optional + Drop all runs of NaNs. 
+ + Returns + ------- + start positions, run lengths, run values + + """ + where = np.flatnonzero + array = asarray_or_dataset(array) + n = len(array) + if n == 0: + return ( + np.array([], dtype=int), + np.array([], dtype=int), + np.array([], dtype=array.dtype), + ) + + if chunksize is None: + chunksize = n + + starts, values = [], [] + last_val = np.nan + for i in range(0, n, chunksize): + x = array[i : i + chunksize] + locs = where(x[1:] != x[:-1]) + 1 + if x[0] != last_val: + locs = np.r_[0, locs] + starts.append(i + locs) + values.append(x[locs]) + last_val = x[-1] + starts = np.concatenate(starts) + lengths = np.diff(np.r_[starts, n]) + values = np.concatenate(values) + + return starts, lengths, values + + +def cmd_exists(cmd: str) -> bool: + return any( + os.access(os.path.join(path, cmd), os.X_OK) + for path in os.environ["PATH"].split(os.pathsep) + ) + + +def mad(data: np.ndarray, axis: int | None = None) -> np.ndarray: + return np.median(np.abs(data - np.median(data, axis)), axis) + + +@contextmanager +def open_hdf5( + fp: str | h5py.Group, + mode: str = "r", + *args, + **kwargs +) -> Generator[h5py.Group, None, None]: + """ + Context manager like ``h5py.File`` but accepts already open HDF5 file + handles which do not get closed on teardown. + + Parameters + ---------- + fp : str or ``h5py.File`` object + If an open file object is provided, it passes through unchanged, + provided that the requested mode is compatible. + If a filepath is passed, the context manager will close the file on + tear down. 
+ + mode : str + * r Readonly, file must exist + * r+ Read/write, file must exist + * a Read/write if exists, create otherwise + * w Truncate if exists, create otherwise + * w- or x Fail if exists, create otherwise + + """ + if isinstance(fp, str): + own_fh = True + fh = h5py.File(fp, mode, *args, **kwargs) + else: + own_fh = False + if mode == "r" and fp.file.mode == "r+": + # warnings.warn("File object provided is writeable but intent is read-only") + pass + elif mode in ("r+", "a") and fp.file.mode == "r": + raise ValueError("File object provided is not writeable") + elif mode == "w": + raise ValueError("Cannot truncate open file") + elif mode in ("w-", "x"): + raise ValueError("File exists") + fh = fp + try: + yield fh + finally: + if own_fh: + fh.close() + + +class closing_hdf5(h5py.Group): + def __init__(self, grp: h5py.Group): + super().__init__(grp.id) + + def __enter__(self) -> h5py.Group: + return self + + def __exit__(self, *exc_info) -> None: + return self.file.close() + + def close(self) -> None: + self.file.close() + + +def attrs_to_jsonable(attrs: h5py.AttributeManager) -> dict: + out = dict(attrs) + for k, v in attrs.items(): + try: + out[k] = v.item() + except ValueError: + out[k] = v.tolist() + except AttributeError: + out[k] = v + return out + + +def infer_meta(x, index=None): # pragma: no cover + """ + Extracted and modified from dask/dataframe/utils.py : + make_meta (BSD licensed) + + Create an empty pandas object containing the desired metadata. + + Parameters + ---------- + x : dict, tuple, list, pd.Series, pd.DataFrame, pd.Index, dtype, scalar + To create a DataFrame, provide a `dict` mapping of `{name: dtype}`, or + an iterable of `(name, dtype)` tuples. To create a `Series`, provide a + tuple of `(name, dtype)`. If a pandas object, names, dtypes, and index + should match the desired output. If a dtype or scalar, a scalar of the + same dtype is returned. + index : pd.Index, optional + Any pandas index to use in the metadata. 
If none provided, a + `RangeIndex` will be used. + + Examples + -------- + >>> make_meta([('a', 'i8'), ('b', 'O')]) + Empty DataFrame + Columns: [a, b] + Index: [] + >>> make_meta(('a', 'f8')) + Series([], Name: a, dtype: float64) + >>> make_meta('i8') + 1 + + """ + + _simple_fake_mapping = { + "b": np.bool_(True), + "V": np.void(b" "), + "M": np.datetime64("1970-01-01"), + "m": np.timedelta64(1), + "S": np.str_("foo"), + "a": np.str_("foo"), + "U": np.str_("foo"), + "O": "foo", + } + + UNKNOWN_CATEGORIES = "__UNKNOWN_CATEGORIES__" + + def _scalar_from_dtype(dtype): + if dtype.kind in ("i", "f", "u"): + return dtype.type(1) + elif dtype.kind == "c": + return dtype.type(complex(1, 0)) + elif dtype.kind in _simple_fake_mapping: + o = _simple_fake_mapping[dtype.kind] + return o.astype(dtype) if dtype.kind in ("m", "M") else o + else: + raise TypeError(f"Can't handle dtype: {dtype}") + + def _nonempty_scalar(x): + if isinstance(x, (pd.Timestamp, pd.Timedelta, pd.Period)): + return x + elif np.isscalar(x): + dtype = x.dtype if hasattr(x, "dtype") else np.dtype(type(x)) + return _scalar_from_dtype(dtype) + else: + raise TypeError("Can't handle meta of type " f"'{type(x).__name__}'") + + def _empty_series(name, dtype, index=None): + if isinstance(dtype, str) and dtype == "category": + return pd.Series( + pd.Categorical([UNKNOWN_CATEGORIES]), name=name, index=index + ).iloc[:0] + return pd.Series([], dtype=dtype, name=name, index=index) + + if hasattr(x, "_meta"): + return x._meta + if isinstance(x, (pd.Series, pd.DataFrame)): + return x.iloc[0:0] + elif isinstance(x, pd.Index): + return x[0:0] + index = index if index is None else index[0:0] + + if isinstance(x, dict): + return pd.DataFrame( + {c: _empty_series(c, d, index=index) for (c, d) in x.items()}, index=index + ) + if isinstance(x, tuple) and len(x) == 2: + return _empty_series(x[0], x[1], index=index) + elif isinstance(x, (list, tuple)): + if not all(isinstance(i, tuple) and len(i) == 2 for i in x): + raise 
ValueError( + "Expected iterable of tuples of (name, dtype), " f"got {x}" + ) + return pd.DataFrame( + {c: _empty_series(c, d, index=index) for (c, d) in x}, + columns=[c for c, d in x], + index=index, + ) + elif not hasattr(x, "dtype") and x is not None: + # could be a string, a dtype object, or a python type. Skip `None`, + # because it is implictly converted to `dtype('f8')`, which we don't + # want here. + try: + dtype = np.dtype(x) + return _scalar_from_dtype(dtype) + except: # noqa + # Continue on to next check + pass + + if is_scalar(x): + return _nonempty_scalar(x) + + raise TypeError(f"Don't know how to create metadata from {x}") + + +def get_meta( + columns, dtype=None, index_columns=None, index_names=None, default_dtype=np.object_ +): # pragma: no cover + """ + Extracted and modified from pandas/io/parsers.py : + _get_empty_meta (BSD licensed). + + """ + columns = list(columns) + + # Convert `dtype` to a defaultdict of some kind. + # This will enable us to write `dtype[col_name]` + # without worrying about KeyError issues later on. + if not isinstance(dtype, dict): + # if dtype == None, default will be default_dtype. + dtype = defaultdict(lambda: dtype or default_dtype) + else: + # Save a copy of the dictionary. + _dtype = dtype.copy() + dtype = defaultdict(lambda: default_dtype) + + # Convert column indexes to column names. 
+ for k, v in _dtype.items(): + col = columns[k] if is_integer(k) else k + dtype[col] = v + + if index_columns is None or index_columns is False: + index = pd.Index([]) + else: + data = [pd.Series([], dtype=dtype[name]) for name in index_names] + if len(data) == 1: + index = pd.Index(data[0], name=index_names[0]) + else: + index = pd.MultiIndex.from_arrays(data, names=index_names) + index_columns.sort() + for i, n in enumerate(index_columns): + columns.pop(n - i) + + col_dict = {col_name: pd.Series([], dtype=dtype[col_name]) for col_name in columns} + + return pd.DataFrame(col_dict, columns=columns, index=index) + + +def check_bins(bins: pd.DataFrame, chromsizes: pd.Series) -> pd.DataFrame: + is_cat = isinstance(bins["chrom"].dtype, pd.CategoricalDtype) + bins = bins.copy() + if not is_cat: + bins["chrom"] = pd.Categorical( + bins.chrom, categories=list(chromsizes.index), ordered=True + ) + else: + assert (bins["chrom"].cat.categories == chromsizes.index).all() + + return bins + + +def balanced_partition( + gs: GenomeSegmentation, + n_chunk_max: int, + file_contigs: list[str], + loadings: list[int | float] | None = None +) -> list[GenomicRangeTuple]: + # n_bins = len(gs.bins) + grouped = gs._bins_grouped + + chrom_nbins = grouped.size() + if loadings is None: + loadings = chrom_nbins + chrmax = loadings.idxmax() + loadings = loadings / loadings.loc[chrmax] + const = chrom_nbins.loc[chrmax] / n_chunk_max + + granges = [] + for chrom, group in grouped: + if chrom not in file_contigs: + continue + clen = gs.chromsizes[chrom] + step = int(np.ceil(const / loadings.loc[chrom])) + anchors = group.start.values[::step] + if anchors[-1] != clen: + anchors = np.r_[anchors, clen] + granges.extend( + (chrom, start, end) for start, end in zip(anchors[:-1], anchors[1:]) + ) + return granges + + +class GenomeSegmentation: + def __init__(self, chromsizes: pd.Series, bins: pd.DataFrame): + bins = check_bins(bins, chromsizes) + self._bins_grouped = bins.groupby("chrom", 
observed=True, sort=False) + nbins_per_chrom = self._bins_grouped.size().values + + self.chromsizes = chromsizes + self.binsize = get_binsize(bins) + self.contigs = list(chromsizes.keys()) + self.bins = bins + self.idmap = pd.Series(index=chromsizes.keys(), data=range(len(chromsizes))) + self.chrom_binoffset = np.r_[0, np.cumsum(nbins_per_chrom)] + self.chrom_abspos = np.r_[0, np.cumsum(chromsizes.values)] + self.start_abspos = ( + self.chrom_abspos[bins["chrom"].cat.codes] + bins["start"].values + ) + + def fetch(self, region: GenomicRangeSpecifier) -> pd.DataFrame: + chrom, start, end = parse_region(region, self.chromsizes) + result = self._bins_grouped.get_group(chrom) + if start > 0 or end < self.chromsizes[chrom]: + lo = result["end"].values.searchsorted(start, side="right") + hi = lo + result["start"].values[lo:].searchsorted(end, side="left") + result = result.iloc[lo:hi] + return result + + +def buffered( + chunks: Iterable[pd.DataFrame], + size: int = 10000000 +) -> Iterator[pd.DataFrame]: + """ + Take an incoming iterator of small data frame chunks and buffer them into + an outgoing iterator of larger chunks. + + Parameters + ---------- + chunks : iterator of :py:class:`pandas.DataFrame` + Each chunk should have the same column names. + size : int + Minimum length of output chunks. + + Yields + ------ + Larger outgoing :py:class:`pandas.DataFrame` chunks made from concatenating + the incoming ones. 
+ + """ + buf = [] + n = 0 + for chunk in chunks: + n += len(chunk) + buf.append(chunk) + if n > size: + yield pd.concat(buf, axis=0) + buf = [] + n = 0 + if len(buf): + yield pd.concat(buf, axis=0) diff --git a/tests/test_chromsize_check.py b/tests/test_chromsize_check.py index a3507db8..058d8b56 100644 --- a/tests/test_chromsize_check.py +++ b/tests/test_chromsize_check.py @@ -1,72 +1,33 @@ +from cooler.util import ( + read_chromsizes, # Correctly import from the 'cooler.util' module +) + import io -from typing import TextIO # Import TextIO for type hinting -import pandas as pd import pytest -from numpy import argsort as argnatsort - - -# Updated read_chromsizes function with TextIO for Python 3.11+ -def read_chromsizes( - filepath_or: str | TextIO, # Use TextIO for file-like objects - name_patterns: tuple[str, ...] = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), - all_names: bool = False, - **kwargs, -) -> pd.Series: - """ - Parse a `.chrom.sizes or .chromInfo.txt file from the UCSC - database, where `db` is a genome assembly name. - """ - # Handle .gz files - if isinstance(filepath_or, str) and filepath_or.endswith(".gz"): - kwargs.setdefault("compression", "gzip") - - # Read the chromosome size file into a DataFrame - chromtable = pd.read_csv( - filepath_or, - sep="\t", - usecols=[0, 1], - names=["name", "length"], - dtype={"name": str}, - **kwargs, - ) - - # Convert the "length" column to numeric values, coercing errors to NaN - chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce") - - # Raise an error if there are any invalid (NaN) lengths - if chromtable["length"].isnull().any(): - raise ValueError( - f"Chromsizes file '{filepath_or}' contains missing or invalid " - "length values. Please ensure that the file is properly formatted " - "as tab-delimited with two columns: sequence name and integer " - "length. Check for extraneous spaces or hidden characters." 
- ) - - # Filter chromosomes by pattern and sort them - if not all_names: - parts = [] - for pattern in name_patterns: - part = chromtable[chromtable["name"].str.contains(pattern)] - part = part.iloc[argnatsort(part["name"])] - parts.append(part) - chromtable = pd.concat(parts, axis=0) - # Set the chromosome names as the index - chromtable.index = chromtable["name"].values - return chromtable["length"] - - -# Test for the read_chromsizes function with invalid length value +# Test for the read_chromsizes function with invalid length value (non-numeric value) def test_read_chromsizes_bad_input(): + # Simulating a bad .chrom.sizes file (with non-numeric value in the 'length' column) broken_data = "chr1\t1000\nchr2\tbad_value\nchr3\t2000\n" broken_file = io.StringIO(broken_data) # Expect a ValueError due to the non-numeric value in the "length" column. - with pytest.raises(ValueError, match="Chromsizes file '.*' invalid length values"): + with pytest.raises(ValueError, match=r"Chromsizes file '.*' contains missing or invalid length values"): + read_chromsizes(broken_file) + + +# Test for the read_chromsizes function with space delimiter instead of tab delimiter +def test_read_chromsizes_bad_delimiter(): + # Simulating a .chrom.sizes file with space delimiters (instead of tabs) + broken_data = "chr1 1000\nchr2 bad_value\nchr3 2000\n" + broken_file = io.StringIO(broken_data) + + # Expect a ValueError due to space delimiter (not tab) in the file + with pytest.raises(ValueError, match=r"Chromsizes file '.*' uses spaces instead of tabs as delimiters. 
Please use tabs."): read_chromsizes(broken_file) -# Main function to run the test +# Main function to run the tests if __name__ == "__main__": pytest.main([__file__]) From 55986ad0cb0817d3aca1eccd3d756f8ebf4c679a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 1 Mar 2025 07:13:57 +0000 Subject: [PATCH 04/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_chromsize_check.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_chromsize_check.py b/tests/test_chromsize_check.py index 058d8b56..4c21df47 100644 --- a/tests/test_chromsize_check.py +++ b/tests/test_chromsize_check.py @@ -1,10 +1,11 @@ +import io + +import pytest + from cooler.util import ( read_chromsizes, # Correctly import from the 'cooler.util' module ) -import io - -import pytest # Test for the read_chromsizes function with invalid length value (non-numeric value) def test_read_chromsizes_bad_input(): From 34bb0700848f1ea80079a050186cc87fa8e89475 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Mon, 3 Mar 2025 20:46:52 -0500 Subject: [PATCH 05/16] Move tests to test_util and fix line lengths --- tests/test_chromsize_check.py | 34 ---------------------------------- tests/test_util.py | 22 +++++++++++++++++++++- 2 files changed, 21 insertions(+), 35 deletions(-) delete mode 100644 tests/test_chromsize_check.py diff --git a/tests/test_chromsize_check.py b/tests/test_chromsize_check.py deleted file mode 100644 index 4c21df47..00000000 --- a/tests/test_chromsize_check.py +++ /dev/null @@ -1,34 +0,0 @@ -import io - -import pytest - -from cooler.util import ( - read_chromsizes, # Correctly import from the 'cooler.util' module -) - - -# Test for the read_chromsizes function with invalid length value (non-numeric value) -def test_read_chromsizes_bad_input(): - # Simulating a bad .chrom.sizes file (with non-numeric value in the 'length' 
column) - broken_data = "chr1\t1000\nchr2\tbad_value\nchr3\t2000\n" - broken_file = io.StringIO(broken_data) - - # Expect a ValueError due to the non-numeric value in the "length" column. - with pytest.raises(ValueError, match=r"Chromsizes file '.*' contains missing or invalid length values"): - read_chromsizes(broken_file) - - -# Test for the read_chromsizes function with space delimiter instead of tab delimiter -def test_read_chromsizes_bad_delimiter(): - # Simulating a .chrom.sizes file with space delimiters (instead of tabs) - broken_data = "chr1 1000\nchr2 bad_value\nchr3 2000\n" - broken_file = io.StringIO(broken_data) - - # Expect a ValueError due to space delimiter (not tab) in the file - with pytest.raises(ValueError, match=r"Chromsizes file '.*' uses spaces instead of tabs as delimiters. Please use tabs."): - read_chromsizes(broken_file) - - -# Main function to run the tests -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/tests/test_util.py b/tests/test_util.py index b3f94be5..dcf1826b 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,5 +1,5 @@ import os.path as op -from io import BytesIO +from io import BytesIO, StringIO import h5py import numpy as np @@ -166,6 +166,26 @@ def test_read_chromsizes(): util.read_chromsizes(op.join(datadir, "toy.chrom.sizes")) +def test_read_chromsizes_bad_input(): + broken_data = "chr1\t1000\nchr2\tbad_value\nchr3\t2000\n" + broken_file = StringIO(broken_data) + with pytest.raises(ValueError): + util.read_chromsizes(broken_file) + + +def test_read_chromsizes_bad_delimiter(): + broken_data = "chr1 1000\nchr2 bad_value\nchr3 2000\n" + broken_file = StringIO(broken_data) + with pytest.raises(ValueError): + util.read_chromsizes(broken_file) + + +# Main function to run the tests +if __name__ == "__main__": + pytest.main([__file__]) + + + # def test_fetch_chromsizes(): # util.fetch_chromsizes("hg19") From db29975aa3f633d4dfc95edb366a71472530e946 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: 
Mon, 3 Mar 2025 20:48:41 -0500 Subject: [PATCH 06/16] Remove carriage returns and fix line lengths --- src/cooler/util.py | 1748 ++++++++++++++++++++++---------------------- 1 file changed, 877 insertions(+), 871 deletions(-) diff --git a/src/cooler/util.py b/src/cooler/util.py index b5fd08ad..44fb853d 100644 --- a/src/cooler/util.py +++ b/src/cooler/util.py @@ -1,871 +1,877 @@ -from __future__ import annotations - -import io -import os -import re -from collections import OrderedDict, defaultdict -from collections.abc import Generator, Iterable, Iterator -from contextlib import contextmanager -from typing import Any - -import h5py -import numpy as np -import pandas as pd -from pandas.api.types import is_integer, is_scalar - -from ._typing import GenomicRangeSpecifier, GenomicRangeTuple - - -def partition(start: int, stop: int, step: int) -> Iterator[tuple[int, int]]: - """Partition an integer interval into equally-sized subintervals. - Like builtin :py:func:`range`, but yields pairs of end points. - - Examples - -------- - >>> for lo, hi in partition(0, 9, 2): - print(lo, hi) - 0 2 - 2 4 - 4 6 - 6 8 - 8 9 - - """ - return ((i, min(i + step, stop)) for i in range(start, stop, step)) - - -def parse_cooler_uri(s: str) -> tuple[str, str]: - """ - Parse a Cooler URI string - - e.g. 
/path/to/mycoolers.cool::/path/to/cooler - - """ - parts = s.split("::") - if len(parts) == 1: - file_path, group_path = parts[0], "/" - elif len(parts) == 2: - file_path, group_path = parts - if not group_path.startswith("/"): - group_path = "/" + group_path - else: - raise ValueError("Invalid Cooler URI string") - return file_path, group_path - - -def atoi(s: str) -> int: - return int(s.replace(",", "")) - - -def parse_humanized(s: str) -> int: - _NUMERIC_RE = re.compile("([0-9,.]+)") - _, value, unit = _NUMERIC_RE.split(s.replace(",", "")) - if not len(unit): - return int(value) - - value = float(value) - unit = unit.upper().strip() - if unit in ("K", "KB"): - value *= 1000 - elif unit in ("M", "MB"): - value *= 1000000 - elif unit in ("G", "GB"): - value *= 1000000000 - else: - raise ValueError(f"Unknown unit '{unit}'") - return int(value) - - -def parse_region_string(s: str) -> tuple[str, int | None, int | None]: - """ - Parse a UCSC-style genomic region string into a triple. - - Parameters - ---------- - s : str - UCSC-style string, e.g. "chr5:10,100,000-30,000,000". Ensembl and FASTA - style sequence names are allowed. End coordinate must be greater than - or equal to start. 
- - Returns - ------- - (str, int or None, int or None) - - """ - - def _tokenize(s): - token_spec = [ - ("HYPHEN", r"-"), - ("COORD", r"[0-9,]+(\.[0-9]*)?(?:[a-z]+)?"), - ("OTHER", r".+"), - ] - pattern = r"|\s*".join([rf"(?P<{pair[0]}>{pair[1]})" for pair in token_spec]) - tok_regex = re.compile(rf"\s*{pattern}", re.IGNORECASE) - for match in tok_regex.finditer(s): - typ = match.lastgroup - yield typ, match.group(typ) - - def _check_token(typ, token, expected): - if typ is None: - raise ValueError("Expected {} token missing".format(" or ".join(expected))) - else: - if typ not in expected: - raise ValueError(f'Unexpected token "{token}"') - - def _expect(tokens): - typ, token = next(tokens, (None, None)) - _check_token(typ, token, ["COORD"]) - start = parse_humanized(token) - - typ, token = next(tokens, (None, None)) - _check_token(typ, token, ["HYPHEN"]) - - typ, token = next(tokens, (None, None)) - if typ is None: - return start, None - - _check_token(typ, token, ["COORD"]) - end = parse_humanized(token) - if end < start: - raise ValueError("End coordinate less than start") - - return start, end - - parts = s.split(":") - chrom = parts[0].strip() - if not len(chrom): - raise ValueError("Chromosome name cannot be empty") - if len(parts) < 2: - return (chrom, None, None) - start, end = _expect(_tokenize(parts[1])) - return (chrom, start, end) - - -def parse_region( - reg: GenomicRangeSpecifier, - chromsizes: dict | pd.Series | None = None -) -> GenomicRangeTuple: - """ - Genomic regions are represented as half-open intervals (0-based starts, - 1-based ends) along the length coordinate of a contig/scaffold/chromosome. - - Parameters - ---------- - reg : str or tuple - UCSC-style genomic region string, or - Triple (chrom, start, end), where ``start`` or ``end`` may be ``None``. - chromsizes : mapping, optional - Lookup table of scaffold lengths to check against ``chrom`` and the - ``end`` coordinate. Required if ``end`` is not supplied. 
- - Returns - ------- - A well-formed genomic region triple (str, int, int) - - """ - if isinstance(reg, str): - chrom, start, end = parse_region_string(reg) - else: - chrom, start, end = reg - start = int(start) if start is not None else start - end = int(end) if end is not None else end - - try: - clen = chromsizes[chrom] if chromsizes is not None else None - except KeyError as e: - raise ValueError(f"Unknown sequence label: {chrom}") from e - - start = 0 if start is None else start - if end is None: - if clen is None: # TODO --- remove? - raise ValueError("Cannot determine end coordinate.") - end = clen - - if end < start: - raise ValueError("End cannot be less than start") - - if start < 0 or (clen is not None and end > clen): - raise ValueError(f"Genomic region out of bounds: [{start}, {end})") - - return chrom, start, end - - -def natsort_key(s: str, _NS_REGEX=re.compile(r"(\d+)", re.U)) -> tuple: - return tuple([int(x) if x.isdigit() else x for x in _NS_REGEX.split(s) if x]) - - -def natsorted(iterable: Iterable[str]) -> list[str]: - return sorted(iterable, key=natsort_key) - - -def argnatsort(array: Iterable[str]) -> np.ndarray: - array = np.asarray(array) - if not len(array): - return np.array([], dtype=int) - cols = tuple(zip(*(natsort_key(x) for x in array))) - return np.lexsort(cols[::-1]) - -def read_chromsizes( - filepath_or: str | io.StringIO, - name_patterns: tuple[str, ...] = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), - all_names: bool = False, - verbose: bool = False, # Optional parameter to enable verbose output - **kwargs, -) -> pd.Series: - """ - Parse a `.chrom.sizes` or `.chromInfo.txt` file from the UCSC - database, where `db` is a genome assembly name. - - Parameters - ---------- - filepath_or : str or file-like - Path or url to text file, or buffer. - name_patterns : sequence, optional - Sequence of regular expressions to capture desired sequence names. - all_names : bool, optional - Whether to return all contigs listed in the file. 
- verbose : bool, optional - Whether to enable verbose logging for diagnostics. - """ - # Check if the input is a file-like object (StringIO or file path) and inspect the first line for delimiters - if isinstance(filepath_or, (str, io.StringIO)): - first_line = None - if isinstance(filepath_or, io.StringIO): - first_line = filepath_or.getvalue().splitlines()[0] - elif isinstance(filepath_or, str): - with open(filepath_or) as file: - first_line = file.readline() - - if first_line and ' ' in first_line: - raise ValueError(f"Chromsizes file '{filepath_or}' uses spaces instead of tabs as delimiters. Please use tabs.") - - # Read the chromosome size file into a DataFrame - if verbose: - print(f"Reading chromsizes file: {filepath_or}") - - chromtable = pd.read_csv( - filepath_or, - sep="\t", # Ensuring tab is the delimiter - usecols=[0, 1], - names=["name", "length"], - dtype={"name": str}, - **kwargs, - ) - - - # Convert the "length" column to numeric values, coercing errors to NaN - chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce") - - # Check for NaN values after reading the file - if chromtable["length"].isnull().any(): - invalid_rows = chromtable[chromtable["length"].isnull()] - if verbose: - print(f"Invalid rows detected: {invalid_rows}") - raise ValueError( - f"Chromsizes file '{filepath_or}' contains missing or invalid length values. " - "Please ensure that the file is properly formatted as tab-delimited with two columns: sequence name and integer length. " - "Check for extraneous spaces or hidden characters. 
Invalid rows: \n{invalid_rows}" - ) - - # Filter by patterns if needed - if not all_names: - parts = [] - for pattern in name_patterns: - part = chromtable[chromtable["name"].str.contains(pattern)] - part = part.iloc[argnatsort(part["name"])] - parts.append(part) - chromtable = pd.concat(parts, axis=0) - - chromtable.index = chromtable["name"].values - return chromtable["length"] - -def fetch_chromsizes(db: str, **kwargs) -> pd.Series: - """ - Download chromosome sizes from UCSC as a :py:class:`pandas.Series`, indexed - by chromosome label. - - """ - return read_chromsizes( - f"http://hgdownload.soe.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz", - **kwargs, - ) - - -def load_fasta(names: list[str], *filepaths: str) -> OrderedDict[str, Any]: - """ - Load lazy FASTA records from one or multiple files without reading them - into memory. - - Parameters - ---------- - names : sequence of str - Names of sequence records in FASTA file or files. - filepaths : str - Paths to one or more FASTA files to gather records from. - - Returns - ------- - OrderedDict of sequence name -> sequence record - - """ - import pyfaidx - - if len(filepaths) == 0: - raise ValueError("Need at least one file") - - if len(filepaths) == 1: - fa = pyfaidx.Fasta(filepaths[0], as_raw=True) - - else: - fa = {} - for filepath in filepaths: - fa.update(pyfaidx.Fasta(filepath, as_raw=True).records) - - records = OrderedDict((chrom, fa[chrom]) for chrom in names) - return records - - -def binnify(chromsizes: pd.Series, binsize: int) -> pd.DataFrame: - """ - Divide a genome into evenly sized bins. - - Parameters - ---------- - chromsizes : Series - pandas Series indexed by chromosome name with chromosome lengths in bp. - binsize : int - size of bins in bp - - Returns - ------- - bins : :py:class:`pandas.DataFrame` - Dataframe with columns: ``chrom``, ``start``, ``end``. 
- - """ - - def _each(chrom): - clen = chromsizes[chrom] - n_bins = int(np.ceil(clen / binsize)) - binedges = np.arange(0, (n_bins + 1)) * binsize - binedges[-1] = clen - return pd.DataFrame( - {"chrom": [chrom] * n_bins, "start": binedges[:-1], "end": binedges[1:]}, - columns=["chrom", "start", "end"], - ) - - bintable = pd.concat(map(_each, chromsizes.keys()), axis=0, ignore_index=True) - - bintable["chrom"] = pd.Categorical( - bintable["chrom"], categories=list(chromsizes.index), ordered=True - ) - - return bintable - - -make_bintable = binnify - - -def digest(fasta_records: OrderedDict[str, Any], enzyme: str) -> pd.DataFrame: - """ - Divide a genome into restriction fragments. - - Parameters - ---------- - fasta_records : OrderedDict - Dictionary of chromosome names to sequence records. - enzyme: str - Name of restriction enzyme (e.g., 'DpnII'). - - Returns - ------- - frags : :py:class:`pandas.DataFrame` - Dataframe with columns: ``chrom``, ``start``, ``end``. - - """ - try: - import Bio.Restriction as biorst - import Bio.Seq as bioseq - except ImportError: - raise ImportError( - "Biopython is required to find restriction fragments." - ) from None - - # http://biopython.org/DIST/docs/cookbook/Restriction.html#mozTocId447698 - chroms = fasta_records.keys() - try: - cut_finder = getattr(biorst, enzyme).search - except AttributeError as e: - raise ValueError(f"Unknown enzyme name: {enzyme}") from e - - def _each(chrom): - seq = bioseq.Seq(str(fasta_records[chrom][:])) - cuts = np.r_[0, np.array(cut_finder(seq)) + 1, len(seq)].astype(np.int64) - n_frags = len(cuts) - 1 - - frags = pd.DataFrame( - {"chrom": [chrom] * n_frags, "start": cuts[:-1], "end": cuts[1:]}, - columns=["chrom", "start", "end"], - ) - return frags - - return pd.concat(map(_each, chroms), axis=0, ignore_index=True) - - -def get_binsize(bins: pd.DataFrame) -> int | None: - """ - Infer bin size from a bin DataFrame. 
Assumes that the last bin of each - contig is allowed to differ in size from the rest. - - Returns - ------- - int or None if bins are non-uniform - - """ - sizes = set() - for _chrom, group in bins.groupby("chrom", observed=True): - sizes.update((group["end"] - group["start"]).iloc[:-1].unique()) - if len(sizes) > 1: - return None - if len(sizes) == 1: - return next(iter(sizes)) - else: - return None - - -def get_chromsizes(bins: pd.DataFrame) -> pd.Series: - """ - Infer chromsizes Series from a bin DataFrame. Assumes that the last bin of - each contig is allowed to differ in size from the rest. - - Returns - ------- - int or None if bins are non-uniform - - """ - chromtable = ( - bins.drop_duplicates(["chrom"], keep="last")[["chrom", "end"]] - .reset_index(drop=True) - .rename(columns={"chrom": "name", "end": "length"}) - ) - chroms, lengths = list(chromtable["name"]), list(chromtable["length"]) - return pd.Series(index=chroms, data=lengths) - - -def bedslice( - grouped, - chromsizes: pd.Series | dict, - region: GenomicRangeSpecifier, -) -> pd.DataFrame: - """ - Range query on a BED-like dataframe with non-overlapping intervals. - - """ - chrom, start, end = parse_region(region, chromsizes) - result = grouped.get_group(chrom) - if start > 0 or end < chromsizes[chrom]: - lo = result["end"].values.searchsorted(start, side="right") - hi = lo + result["start"].values[lo:].searchsorted(end, side="left") - result = result.iloc[lo:hi] - return result - - -def asarray_or_dataset(x: Any) -> np.ndarray | h5py.Dataset: - return x if isinstance(x, h5py.Dataset) else np.asarray(x) - - -def rlencode( - array: np.ndarray, - chunksize: int | None = None -) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Run length encoding. - Based on http://stackoverflow.com/a/32681075, which is based on the rle - function from R. - - Parameters - ---------- - x : 1D array_like - Input array to encode - dropna: bool, optional - Drop all runs of NaNs. 
- - Returns - ------- - start positions, run lengths, run values - - """ - where = np.flatnonzero - array = asarray_or_dataset(array) - n = len(array) - if n == 0: - return ( - np.array([], dtype=int), - np.array([], dtype=int), - np.array([], dtype=array.dtype), - ) - - if chunksize is None: - chunksize = n - - starts, values = [], [] - last_val = np.nan - for i in range(0, n, chunksize): - x = array[i : i + chunksize] - locs = where(x[1:] != x[:-1]) + 1 - if x[0] != last_val: - locs = np.r_[0, locs] - starts.append(i + locs) - values.append(x[locs]) - last_val = x[-1] - starts = np.concatenate(starts) - lengths = np.diff(np.r_[starts, n]) - values = np.concatenate(values) - - return starts, lengths, values - - -def cmd_exists(cmd: str) -> bool: - return any( - os.access(os.path.join(path, cmd), os.X_OK) - for path in os.environ["PATH"].split(os.pathsep) - ) - - -def mad(data: np.ndarray, axis: int | None = None) -> np.ndarray: - return np.median(np.abs(data - np.median(data, axis)), axis) - - -@contextmanager -def open_hdf5( - fp: str | h5py.Group, - mode: str = "r", - *args, - **kwargs -) -> Generator[h5py.Group, None, None]: - """ - Context manager like ``h5py.File`` but accepts already open HDF5 file - handles which do not get closed on teardown. - - Parameters - ---------- - fp : str or ``h5py.File`` object - If an open file object is provided, it passes through unchanged, - provided that the requested mode is compatible. - If a filepath is passed, the context manager will close the file on - tear down. 
- - mode : str - * r Readonly, file must exist - * r+ Read/write, file must exist - * a Read/write if exists, create otherwise - * w Truncate if exists, create otherwise - * w- or x Fail if exists, create otherwise - - """ - if isinstance(fp, str): - own_fh = True - fh = h5py.File(fp, mode, *args, **kwargs) - else: - own_fh = False - if mode == "r" and fp.file.mode == "r+": - # warnings.warn("File object provided is writeable but intent is read-only") - pass - elif mode in ("r+", "a") and fp.file.mode == "r": - raise ValueError("File object provided is not writeable") - elif mode == "w": - raise ValueError("Cannot truncate open file") - elif mode in ("w-", "x"): - raise ValueError("File exists") - fh = fp - try: - yield fh - finally: - if own_fh: - fh.close() - - -class closing_hdf5(h5py.Group): - def __init__(self, grp: h5py.Group): - super().__init__(grp.id) - - def __enter__(self) -> h5py.Group: - return self - - def __exit__(self, *exc_info) -> None: - return self.file.close() - - def close(self) -> None: - self.file.close() - - -def attrs_to_jsonable(attrs: h5py.AttributeManager) -> dict: - out = dict(attrs) - for k, v in attrs.items(): - try: - out[k] = v.item() - except ValueError: - out[k] = v.tolist() - except AttributeError: - out[k] = v - return out - - -def infer_meta(x, index=None): # pragma: no cover - """ - Extracted and modified from dask/dataframe/utils.py : - make_meta (BSD licensed) - - Create an empty pandas object containing the desired metadata. - - Parameters - ---------- - x : dict, tuple, list, pd.Series, pd.DataFrame, pd.Index, dtype, scalar - To create a DataFrame, provide a `dict` mapping of `{name: dtype}`, or - an iterable of `(name, dtype)` tuples. To create a `Series`, provide a - tuple of `(name, dtype)`. If a pandas object, names, dtypes, and index - should match the desired output. If a dtype or scalar, a scalar of the - same dtype is returned. - index : pd.Index, optional - Any pandas index to use in the metadata. 
If none provided, a - `RangeIndex` will be used. - - Examples - -------- - >>> make_meta([('a', 'i8'), ('b', 'O')]) - Empty DataFrame - Columns: [a, b] - Index: [] - >>> make_meta(('a', 'f8')) - Series([], Name: a, dtype: float64) - >>> make_meta('i8') - 1 - - """ - - _simple_fake_mapping = { - "b": np.bool_(True), - "V": np.void(b" "), - "M": np.datetime64("1970-01-01"), - "m": np.timedelta64(1), - "S": np.str_("foo"), - "a": np.str_("foo"), - "U": np.str_("foo"), - "O": "foo", - } - - UNKNOWN_CATEGORIES = "__UNKNOWN_CATEGORIES__" - - def _scalar_from_dtype(dtype): - if dtype.kind in ("i", "f", "u"): - return dtype.type(1) - elif dtype.kind == "c": - return dtype.type(complex(1, 0)) - elif dtype.kind in _simple_fake_mapping: - o = _simple_fake_mapping[dtype.kind] - return o.astype(dtype) if dtype.kind in ("m", "M") else o - else: - raise TypeError(f"Can't handle dtype: {dtype}") - - def _nonempty_scalar(x): - if isinstance(x, (pd.Timestamp, pd.Timedelta, pd.Period)): - return x - elif np.isscalar(x): - dtype = x.dtype if hasattr(x, "dtype") else np.dtype(type(x)) - return _scalar_from_dtype(dtype) - else: - raise TypeError("Can't handle meta of type " f"'{type(x).__name__}'") - - def _empty_series(name, dtype, index=None): - if isinstance(dtype, str) and dtype == "category": - return pd.Series( - pd.Categorical([UNKNOWN_CATEGORIES]), name=name, index=index - ).iloc[:0] - return pd.Series([], dtype=dtype, name=name, index=index) - - if hasattr(x, "_meta"): - return x._meta - if isinstance(x, (pd.Series, pd.DataFrame)): - return x.iloc[0:0] - elif isinstance(x, pd.Index): - return x[0:0] - index = index if index is None else index[0:0] - - if isinstance(x, dict): - return pd.DataFrame( - {c: _empty_series(c, d, index=index) for (c, d) in x.items()}, index=index - ) - if isinstance(x, tuple) and len(x) == 2: - return _empty_series(x[0], x[1], index=index) - elif isinstance(x, (list, tuple)): - if not all(isinstance(i, tuple) and len(i) == 2 for i in x): - raise 
ValueError( - "Expected iterable of tuples of (name, dtype), " f"got {x}" - ) - return pd.DataFrame( - {c: _empty_series(c, d, index=index) for (c, d) in x}, - columns=[c for c, d in x], - index=index, - ) - elif not hasattr(x, "dtype") and x is not None: - # could be a string, a dtype object, or a python type. Skip `None`, - # because it is implictly converted to `dtype('f8')`, which we don't - # want here. - try: - dtype = np.dtype(x) - return _scalar_from_dtype(dtype) - except: # noqa - # Continue on to next check - pass - - if is_scalar(x): - return _nonempty_scalar(x) - - raise TypeError(f"Don't know how to create metadata from {x}") - - -def get_meta( - columns, dtype=None, index_columns=None, index_names=None, default_dtype=np.object_ -): # pragma: no cover - """ - Extracted and modified from pandas/io/parsers.py : - _get_empty_meta (BSD licensed). - - """ - columns = list(columns) - - # Convert `dtype` to a defaultdict of some kind. - # This will enable us to write `dtype[col_name]` - # without worrying about KeyError issues later on. - if not isinstance(dtype, dict): - # if dtype == None, default will be default_dtype. - dtype = defaultdict(lambda: dtype or default_dtype) - else: - # Save a copy of the dictionary. - _dtype = dtype.copy() - dtype = defaultdict(lambda: default_dtype) - - # Convert column indexes to column names. 
- for k, v in _dtype.items(): - col = columns[k] if is_integer(k) else k - dtype[col] = v - - if index_columns is None or index_columns is False: - index = pd.Index([]) - else: - data = [pd.Series([], dtype=dtype[name]) for name in index_names] - if len(data) == 1: - index = pd.Index(data[0], name=index_names[0]) - else: - index = pd.MultiIndex.from_arrays(data, names=index_names) - index_columns.sort() - for i, n in enumerate(index_columns): - columns.pop(n - i) - - col_dict = {col_name: pd.Series([], dtype=dtype[col_name]) for col_name in columns} - - return pd.DataFrame(col_dict, columns=columns, index=index) - - -def check_bins(bins: pd.DataFrame, chromsizes: pd.Series) -> pd.DataFrame: - is_cat = isinstance(bins["chrom"].dtype, pd.CategoricalDtype) - bins = bins.copy() - if not is_cat: - bins["chrom"] = pd.Categorical( - bins.chrom, categories=list(chromsizes.index), ordered=True - ) - else: - assert (bins["chrom"].cat.categories == chromsizes.index).all() - - return bins - - -def balanced_partition( - gs: GenomeSegmentation, - n_chunk_max: int, - file_contigs: list[str], - loadings: list[int | float] | None = None -) -> list[GenomicRangeTuple]: - # n_bins = len(gs.bins) - grouped = gs._bins_grouped - - chrom_nbins = grouped.size() - if loadings is None: - loadings = chrom_nbins - chrmax = loadings.idxmax() - loadings = loadings / loadings.loc[chrmax] - const = chrom_nbins.loc[chrmax] / n_chunk_max - - granges = [] - for chrom, group in grouped: - if chrom not in file_contigs: - continue - clen = gs.chromsizes[chrom] - step = int(np.ceil(const / loadings.loc[chrom])) - anchors = group.start.values[::step] - if anchors[-1] != clen: - anchors = np.r_[anchors, clen] - granges.extend( - (chrom, start, end) for start, end in zip(anchors[:-1], anchors[1:]) - ) - return granges - - -class GenomeSegmentation: - def __init__(self, chromsizes: pd.Series, bins: pd.DataFrame): - bins = check_bins(bins, chromsizes) - self._bins_grouped = bins.groupby("chrom", 
observed=True, sort=False) - nbins_per_chrom = self._bins_grouped.size().values - - self.chromsizes = chromsizes - self.binsize = get_binsize(bins) - self.contigs = list(chromsizes.keys()) - self.bins = bins - self.idmap = pd.Series(index=chromsizes.keys(), data=range(len(chromsizes))) - self.chrom_binoffset = np.r_[0, np.cumsum(nbins_per_chrom)] - self.chrom_abspos = np.r_[0, np.cumsum(chromsizes.values)] - self.start_abspos = ( - self.chrom_abspos[bins["chrom"].cat.codes] + bins["start"].values - ) - - def fetch(self, region: GenomicRangeSpecifier) -> pd.DataFrame: - chrom, start, end = parse_region(region, self.chromsizes) - result = self._bins_grouped.get_group(chrom) - if start > 0 or end < self.chromsizes[chrom]: - lo = result["end"].values.searchsorted(start, side="right") - hi = lo + result["start"].values[lo:].searchsorted(end, side="left") - result = result.iloc[lo:hi] - return result - - -def buffered( - chunks: Iterable[pd.DataFrame], - size: int = 10000000 -) -> Iterator[pd.DataFrame]: - """ - Take an incoming iterator of small data frame chunks and buffer them into - an outgoing iterator of larger chunks. - - Parameters - ---------- - chunks : iterator of :py:class:`pandas.DataFrame` - Each chunk should have the same column names. - size : int - Minimum length of output chunks. - - Yields - ------ - Larger outgoing :py:class:`pandas.DataFrame` chunks made from concatenating - the incoming ones. 
- - """ - buf = [] - n = 0 - for chunk in chunks: - n += len(chunk) - buf.append(chunk) - if n > size: - yield pd.concat(buf, axis=0) - buf = [] - n = 0 - if len(buf): - yield pd.concat(buf, axis=0) +from __future__ import annotations + +import io +import os +import re +from collections import OrderedDict, defaultdict +from collections.abc import Generator, Iterable, Iterator +from contextlib import contextmanager +from typing import Any + +import h5py +import numpy as np +import pandas as pd +from pandas.api.types import is_integer, is_scalar + +from ._typing import GenomicRangeSpecifier, GenomicRangeTuple + + +def partition(start: int, stop: int, step: int) -> Iterator[tuple[int, int]]: + """Partition an integer interval into equally-sized subintervals. + Like builtin :py:func:`range`, but yields pairs of end points. + + Examples + -------- + >>> for lo, hi in partition(0, 9, 2): + print(lo, hi) + 0 2 + 2 4 + 4 6 + 6 8 + 8 9 + + """ + return ((i, min(i + step, stop)) for i in range(start, stop, step)) + + +def parse_cooler_uri(s: str) -> tuple[str, str]: + """ + Parse a Cooler URI string + + e.g. 
/path/to/mycoolers.cool::/path/to/cooler + + """ + parts = s.split("::") + if len(parts) == 1: + file_path, group_path = parts[0], "/" + elif len(parts) == 2: + file_path, group_path = parts + if not group_path.startswith("/"): + group_path = "/" + group_path + else: + raise ValueError("Invalid Cooler URI string") + return file_path, group_path + + +def atoi(s: str) -> int: + return int(s.replace(",", "")) + + +def parse_humanized(s: str) -> int: + _NUMERIC_RE = re.compile("([0-9,.]+)") + _, value, unit = _NUMERIC_RE.split(s.replace(",", "")) + if not len(unit): + return int(value) + + value = float(value) + unit = unit.upper().strip() + if unit in ("K", "KB"): + value *= 1000 + elif unit in ("M", "MB"): + value *= 1000000 + elif unit in ("G", "GB"): + value *= 1000000000 + else: + raise ValueError(f"Unknown unit '{unit}'") + return int(value) + + +def parse_region_string(s: str) -> tuple[str, int | None, int | None]: + """ + Parse a UCSC-style genomic region string into a triple. + + Parameters + ---------- + s : str + UCSC-style string, e.g. "chr5:10,100,000-30,000,000". Ensembl and FASTA + style sequence names are allowed. End coordinate must be greater than + or equal to start. 
+ + Returns + ------- + (str, int or None, int or None) + + """ + + def _tokenize(s): + token_spec = [ + ("HYPHEN", r"-"), + ("COORD", r"[0-9,]+(\.[0-9]*)?(?:[a-z]+)?"), + ("OTHER", r".+"), + ] + pattern = r"|\s*".join([rf"(?P<{pair[0]}>{pair[1]})" for pair in token_spec]) + tok_regex = re.compile(rf"\s*{pattern}", re.IGNORECASE) + for match in tok_regex.finditer(s): + typ = match.lastgroup + yield typ, match.group(typ) + + def _check_token(typ, token, expected): + if typ is None: + raise ValueError("Expected {} token missing".format(" or ".join(expected))) + else: + if typ not in expected: + raise ValueError(f'Unexpected token "{token}"') + + def _expect(tokens): + typ, token = next(tokens, (None, None)) + _check_token(typ, token, ["COORD"]) + start = parse_humanized(token) + + typ, token = next(tokens, (None, None)) + _check_token(typ, token, ["HYPHEN"]) + + typ, token = next(tokens, (None, None)) + if typ is None: + return start, None + + _check_token(typ, token, ["COORD"]) + end = parse_humanized(token) + if end < start: + raise ValueError("End coordinate less than start") + + return start, end + + parts = s.split(":") + chrom = parts[0].strip() + if not len(chrom): + raise ValueError("Chromosome name cannot be empty") + if len(parts) < 2: + return (chrom, None, None) + start, end = _expect(_tokenize(parts[1])) + return (chrom, start, end) + + +def parse_region( + reg: GenomicRangeSpecifier, + chromsizes: dict | pd.Series | None = None +) -> GenomicRangeTuple: + """ + Genomic regions are represented as half-open intervals (0-based starts, + 1-based ends) along the length coordinate of a contig/scaffold/chromosome. + + Parameters + ---------- + reg : str or tuple + UCSC-style genomic region string, or + Triple (chrom, start, end), where ``start`` or ``end`` may be ``None``. + chromsizes : mapping, optional + Lookup table of scaffold lengths to check against ``chrom`` and the + ``end`` coordinate. Required if ``end`` is not supplied. 
+ + Returns + ------- + A well-formed genomic region triple (str, int, int) + + """ + if isinstance(reg, str): + chrom, start, end = parse_region_string(reg) + else: + chrom, start, end = reg + start = int(start) if start is not None else start + end = int(end) if end is not None else end + + try: + clen = chromsizes[chrom] if chromsizes is not None else None + except KeyError as e: + raise ValueError(f"Unknown sequence label: {chrom}") from e + + start = 0 if start is None else start + if end is None: + if clen is None: # TODO --- remove? + raise ValueError("Cannot determine end coordinate.") + end = clen + + if end < start: + raise ValueError("End cannot be less than start") + + if start < 0 or (clen is not None and end > clen): + raise ValueError(f"Genomic region out of bounds: [{start}, {end})") + + return chrom, start, end + + +def natsort_key(s: str, _NS_REGEX=re.compile(r"(\d+)", re.U)) -> tuple: + return tuple([int(x) if x.isdigit() else x for x in _NS_REGEX.split(s) if x]) + + +def natsorted(iterable: Iterable[str]) -> list[str]: + return sorted(iterable, key=natsort_key) + + +def argnatsort(array: Iterable[str]) -> np.ndarray: + array = np.asarray(array) + if not len(array): + return np.array([], dtype=int) + cols = tuple(zip(*(natsort_key(x) for x in array))) + return np.lexsort(cols[::-1]) + +def read_chromsizes( + filepath_or: str | io.StringIO, + name_patterns: tuple[str, ...] = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), + all_names: bool = False, + verbose: bool = False, # Optional parameter to enable verbose output + **kwargs, +) -> pd.Series: + """ + Parse a `.chrom.sizes` or `.chromInfo.txt` file from the UCSC + database, where `db` is a genome assembly name. + + Parameters + ---------- + filepath_or : str or file-like + Path or url to text file, or buffer. + name_patterns : sequence, optional + Sequence of regular expressions to capture desired sequence names. + all_names : bool, optional + Whether to return all contigs listed in the file. 
+
+    verbose : bool, optional
+        Whether to enable verbose logging for diagnostics.
+    """
+    # Peek at the first line to catch space-delimited files early. Skip the
+    # check for gzipped files and remote URLs, which open() cannot read.
+    if isinstance(filepath_or, (str, io.StringIO)):
+        first_line = None
+        if isinstance(filepath_or, io.StringIO):
+            first_line = filepath_or.getvalue().split("\n", 1)[0]
+        elif isinstance(filepath_or, str) and os.path.isfile(filepath_or) and not filepath_or.endswith(".gz"):
+            with open(filepath_or) as file:
+                first_line = file.readline()
+
+        if first_line and "\t" not in first_line and " " in first_line:
+            raise ValueError(
+                f"Chromsizes file '{filepath_or}' uses spaces instead of tabs "
+                "as delimiters. Please use tabs.")
+
+    # Read the chromosome size file into a DataFrame
+    if verbose:
+        print(f"Reading chromsizes file: {filepath_or}")
+
+    chromtable = pd.read_csv(
+        filepath_or,
+        sep="\t",  # Ensuring tab is the delimiter
+        usecols=[0, 1],
+        names=["name", "length"],
+        dtype={"name": str},
+        **kwargs,
+    )
+
+
+    # Convert the "length" column to numeric values, coercing errors to NaN
+    chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce")
+
+    # Check for NaN values after reading the file
+    if chromtable["length"].isnull().any():
+        invalid_rows = chromtable[chromtable["length"].isnull()]
+        if verbose:
+            print(f"Invalid rows detected: {invalid_rows}")
+        raise ValueError(
+            f"Chromsizes file '{filepath_or}' contains missing or invalid "
+            "length values. Please ensure that the file is properly formatted "
+            "as tab-delimited with two columns: sequence name and integer "
+            "length. Check for extraneous spaces or hidden characters. 
"
+            f"Invalid rows: \n{invalid_rows}"
+        )
+
+    # Filter by patterns if needed
+    if not all_names:
+        parts = []
+        for pattern in name_patterns:
+            part = chromtable[chromtable["name"].str.contains(pattern)]
+            part = part.iloc[argnatsort(part["name"])]
+            parts.append(part)
+        chromtable = pd.concat(parts, axis=0)
+
+    chromtable.index = chromtable["name"].values
+    return chromtable["length"]
+
+
+def fetch_chromsizes(db: str, **kwargs) -> pd.Series:
+    """
+    Download chromosome sizes from UCSC as a :py:class:`pandas.Series`, indexed
+    by chromosome label.
+
+    """
+    return read_chromsizes(
+        f"http://hgdownload.soe.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz",
+        **kwargs,
+    )
+
+
+def load_fasta(names: list[str], *filepaths: str) -> OrderedDict[str, Any]:
+    """
+    Load lazy FASTA records from one or multiple files without reading them
+    into memory.
+
+    Parameters
+    ----------
+    names : sequence of str
+        Names of sequence records in FASTA file or files.
+    filepaths : str
+        Paths to one or more FASTA files to gather records from.
+
+    Returns
+    -------
+    OrderedDict of sequence name -> sequence record
+
+    """
+    import pyfaidx
+
+    if len(filepaths) == 0:
+        raise ValueError("Need at least one file")
+
+    if len(filepaths) == 1:
+        fa = pyfaidx.Fasta(filepaths[0], as_raw=True)
+
+    else:
+        fa = {}
+        for filepath in filepaths:
+            fa.update(pyfaidx.Fasta(filepath, as_raw=True).records)
+
+    records = OrderedDict((chrom, fa[chrom]) for chrom in names)
+    return records
+
+
+def binnify(chromsizes: pd.Series, binsize: int) -> pd.DataFrame:
+    """
+    Divide a genome into evenly sized bins.
+
+    Parameters
+    ----------
+    chromsizes : Series
+        pandas Series indexed by chromosome name with chromosome lengths in bp.
+    binsize : int
+        size of bins in bp
+
+    Returns
+    -------
+    bins : :py:class:`pandas.DataFrame`
+        Dataframe with columns: ``chrom``, ``start``, ``end``. 
+ + """ + + def _each(chrom): + clen = chromsizes[chrom] + n_bins = int(np.ceil(clen / binsize)) + binedges = np.arange(0, (n_bins + 1)) * binsize + binedges[-1] = clen + return pd.DataFrame( + {"chrom": [chrom] * n_bins, "start": binedges[:-1], "end": binedges[1:]}, + columns=["chrom", "start", "end"], + ) + + bintable = pd.concat(map(_each, chromsizes.keys()), axis=0, ignore_index=True) + + bintable["chrom"] = pd.Categorical( + bintable["chrom"], categories=list(chromsizes.index), ordered=True + ) + + return bintable + + +make_bintable = binnify + + +def digest(fasta_records: OrderedDict[str, Any], enzyme: str) -> pd.DataFrame: + """ + Divide a genome into restriction fragments. + + Parameters + ---------- + fasta_records : OrderedDict + Dictionary of chromosome names to sequence records. + enzyme: str + Name of restriction enzyme (e.g., 'DpnII'). + + Returns + ------- + frags : :py:class:`pandas.DataFrame` + Dataframe with columns: ``chrom``, ``start``, ``end``. + + """ + try: + import Bio.Restriction as biorst + import Bio.Seq as bioseq + except ImportError: + raise ImportError( + "Biopython is required to find restriction fragments." + ) from None + + # http://biopython.org/DIST/docs/cookbook/Restriction.html#mozTocId447698 + chroms = fasta_records.keys() + try: + cut_finder = getattr(biorst, enzyme).search + except AttributeError as e: + raise ValueError(f"Unknown enzyme name: {enzyme}") from e + + def _each(chrom): + seq = bioseq.Seq(str(fasta_records[chrom][:])) + cuts = np.r_[0, np.array(cut_finder(seq)) + 1, len(seq)].astype(np.int64) + n_frags = len(cuts) - 1 + + frags = pd.DataFrame( + {"chrom": [chrom] * n_frags, "start": cuts[:-1], "end": cuts[1:]}, + columns=["chrom", "start", "end"], + ) + return frags + + return pd.concat(map(_each, chroms), axis=0, ignore_index=True) + + +def get_binsize(bins: pd.DataFrame) -> int | None: + """ + Infer bin size from a bin DataFrame. 
Assumes that the last bin of each + contig is allowed to differ in size from the rest. + + Returns + ------- + int or None if bins are non-uniform + + """ + sizes = set() + for _chrom, group in bins.groupby("chrom", observed=True): + sizes.update((group["end"] - group["start"]).iloc[:-1].unique()) + if len(sizes) > 1: + return None + if len(sizes) == 1: + return next(iter(sizes)) + else: + return None + + +def get_chromsizes(bins: pd.DataFrame) -> pd.Series: + """ + Infer chromsizes Series from a bin DataFrame. Assumes that the last bin of + each contig is allowed to differ in size from the rest. + + Returns + ------- + int or None if bins are non-uniform + + """ + chromtable = ( + bins.drop_duplicates(["chrom"], keep="last")[["chrom", "end"]] + .reset_index(drop=True) + .rename(columns={"chrom": "name", "end": "length"}) + ) + chroms, lengths = list(chromtable["name"]), list(chromtable["length"]) + return pd.Series(index=chroms, data=lengths) + + +def bedslice( + grouped, + chromsizes: pd.Series | dict, + region: GenomicRangeSpecifier, +) -> pd.DataFrame: + """ + Range query on a BED-like dataframe with non-overlapping intervals. + + """ + chrom, start, end = parse_region(region, chromsizes) + result = grouped.get_group(chrom) + if start > 0 or end < chromsizes[chrom]: + lo = result["end"].values.searchsorted(start, side="right") + hi = lo + result["start"].values[lo:].searchsorted(end, side="left") + result = result.iloc[lo:hi] + return result + + +def asarray_or_dataset(x: Any) -> np.ndarray | h5py.Dataset: + return x if isinstance(x, h5py.Dataset) else np.asarray(x) + + +def rlencode( + array: np.ndarray, + chunksize: int | None = None +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Run length encoding. + Based on http://stackoverflow.com/a/32681075, which is based on the rle + function from R. + + Parameters + ---------- + x : 1D array_like + Input array to encode + dropna: bool, optional + Drop all runs of NaNs. 
+ + Returns + ------- + start positions, run lengths, run values + + """ + where = np.flatnonzero + array = asarray_or_dataset(array) + n = len(array) + if n == 0: + return ( + np.array([], dtype=int), + np.array([], dtype=int), + np.array([], dtype=array.dtype), + ) + + if chunksize is None: + chunksize = n + + starts, values = [], [] + last_val = np.nan + for i in range(0, n, chunksize): + x = array[i : i + chunksize] + locs = where(x[1:] != x[:-1]) + 1 + if x[0] != last_val: + locs = np.r_[0, locs] + starts.append(i + locs) + values.append(x[locs]) + last_val = x[-1] + starts = np.concatenate(starts) + lengths = np.diff(np.r_[starts, n]) + values = np.concatenate(values) + + return starts, lengths, values + + +def cmd_exists(cmd: str) -> bool: + return any( + os.access(os.path.join(path, cmd), os.X_OK) + for path in os.environ["PATH"].split(os.pathsep) + ) + + +def mad(data: np.ndarray, axis: int | None = None) -> np.ndarray: + return np.median(np.abs(data - np.median(data, axis)), axis) + + +@contextmanager +def open_hdf5( + fp: str | h5py.Group, + mode: str = "r", + *args, + **kwargs +) -> Generator[h5py.Group, None, None]: + """ + Context manager like ``h5py.File`` but accepts already open HDF5 file + handles which do not get closed on teardown. + + Parameters + ---------- + fp : str or ``h5py.File`` object + If an open file object is provided, it passes through unchanged, + provided that the requested mode is compatible. + If a filepath is passed, the context manager will close the file on + tear down. 
+ + mode : str + * r Readonly, file must exist + * r+ Read/write, file must exist + * a Read/write if exists, create otherwise + * w Truncate if exists, create otherwise + * w- or x Fail if exists, create otherwise + + """ + if isinstance(fp, str): + own_fh = True + fh = h5py.File(fp, mode, *args, **kwargs) + else: + own_fh = False + if mode == "r" and fp.file.mode == "r+": + # warnings.warn("File object provided is writeable but intent is read-only") + pass + elif mode in ("r+", "a") and fp.file.mode == "r": + raise ValueError("File object provided is not writeable") + elif mode == "w": + raise ValueError("Cannot truncate open file") + elif mode in ("w-", "x"): + raise ValueError("File exists") + fh = fp + try: + yield fh + finally: + if own_fh: + fh.close() + + +class closing_hdf5(h5py.Group): + def __init__(self, grp: h5py.Group): + super().__init__(grp.id) + + def __enter__(self) -> h5py.Group: + return self + + def __exit__(self, *exc_info) -> None: + return self.file.close() + + def close(self) -> None: + self.file.close() + + +def attrs_to_jsonable(attrs: h5py.AttributeManager) -> dict: + out = dict(attrs) + for k, v in attrs.items(): + try: + out[k] = v.item() + except ValueError: + out[k] = v.tolist() + except AttributeError: + out[k] = v + return out + + +def infer_meta(x, index=None): # pragma: no cover + """ + Extracted and modified from dask/dataframe/utils.py : + make_meta (BSD licensed) + + Create an empty pandas object containing the desired metadata. + + Parameters + ---------- + x : dict, tuple, list, pd.Series, pd.DataFrame, pd.Index, dtype, scalar + To create a DataFrame, provide a `dict` mapping of `{name: dtype}`, or + an iterable of `(name, dtype)` tuples. To create a `Series`, provide a + tuple of `(name, dtype)`. If a pandas object, names, dtypes, and index + should match the desired output. If a dtype or scalar, a scalar of the + same dtype is returned. + index : pd.Index, optional + Any pandas index to use in the metadata. 
If none provided, a + `RangeIndex` will be used. + + Examples + -------- + >>> make_meta([('a', 'i8'), ('b', 'O')]) + Empty DataFrame + Columns: [a, b] + Index: [] + >>> make_meta(('a', 'f8')) + Series([], Name: a, dtype: float64) + >>> make_meta('i8') + 1 + + """ + + _simple_fake_mapping = { + "b": np.bool_(True), + "V": np.void(b" "), + "M": np.datetime64("1970-01-01"), + "m": np.timedelta64(1), + "S": np.str_("foo"), + "a": np.str_("foo"), + "U": np.str_("foo"), + "O": "foo", + } + + UNKNOWN_CATEGORIES = "__UNKNOWN_CATEGORIES__" + + def _scalar_from_dtype(dtype): + if dtype.kind in ("i", "f", "u"): + return dtype.type(1) + elif dtype.kind == "c": + return dtype.type(complex(1, 0)) + elif dtype.kind in _simple_fake_mapping: + o = _simple_fake_mapping[dtype.kind] + return o.astype(dtype) if dtype.kind in ("m", "M") else o + else: + raise TypeError(f"Can't handle dtype: {dtype}") + + def _nonempty_scalar(x): + if isinstance(x, (pd.Timestamp, pd.Timedelta, pd.Period)): + return x + elif np.isscalar(x): + dtype = x.dtype if hasattr(x, "dtype") else np.dtype(type(x)) + return _scalar_from_dtype(dtype) + else: + raise TypeError("Can't handle meta of type " f"'{type(x).__name__}'") + + def _empty_series(name, dtype, index=None): + if isinstance(dtype, str) and dtype == "category": + return pd.Series( + pd.Categorical([UNKNOWN_CATEGORIES]), name=name, index=index + ).iloc[:0] + return pd.Series([], dtype=dtype, name=name, index=index) + + if hasattr(x, "_meta"): + return x._meta + if isinstance(x, (pd.Series, pd.DataFrame)): + return x.iloc[0:0] + elif isinstance(x, pd.Index): + return x[0:0] + index = index if index is None else index[0:0] + + if isinstance(x, dict): + return pd.DataFrame( + {c: _empty_series(c, d, index=index) for (c, d) in x.items()}, index=index + ) + if isinstance(x, tuple) and len(x) == 2: + return _empty_series(x[0], x[1], index=index) + elif isinstance(x, (list, tuple)): + if not all(isinstance(i, tuple) and len(i) == 2 for i in x): + raise 
ValueError( + "Expected iterable of tuples of (name, dtype), " f"got {x}" + ) + return pd.DataFrame( + {c: _empty_series(c, d, index=index) for (c, d) in x}, + columns=[c for c, d in x], + index=index, + ) + elif not hasattr(x, "dtype") and x is not None: + # could be a string, a dtype object, or a python type. Skip `None`, + # because it is implictly converted to `dtype('f8')`, which we don't + # want here. + try: + dtype = np.dtype(x) + return _scalar_from_dtype(dtype) + except: # noqa + # Continue on to next check + pass + + if is_scalar(x): + return _nonempty_scalar(x) + + raise TypeError(f"Don't know how to create metadata from {x}") + + +def get_meta( + columns, dtype=None, index_columns=None, index_names=None, default_dtype=np.object_ +): # pragma: no cover + """ + Extracted and modified from pandas/io/parsers.py : + _get_empty_meta (BSD licensed). + + """ + columns = list(columns) + + # Convert `dtype` to a defaultdict of some kind. + # This will enable us to write `dtype[col_name]` + # without worrying about KeyError issues later on. + if not isinstance(dtype, dict): + # if dtype == None, default will be default_dtype. + dtype = defaultdict(lambda: dtype or default_dtype) + else: + # Save a copy of the dictionary. + _dtype = dtype.copy() + dtype = defaultdict(lambda: default_dtype) + + # Convert column indexes to column names. 
+ for k, v in _dtype.items(): + col = columns[k] if is_integer(k) else k + dtype[col] = v + + if index_columns is None or index_columns is False: + index = pd.Index([]) + else: + data = [pd.Series([], dtype=dtype[name]) for name in index_names] + if len(data) == 1: + index = pd.Index(data[0], name=index_names[0]) + else: + index = pd.MultiIndex.from_arrays(data, names=index_names) + index_columns.sort() + for i, n in enumerate(index_columns): + columns.pop(n - i) + + col_dict = {col_name: pd.Series([], dtype=dtype[col_name]) for col_name in columns} + + return pd.DataFrame(col_dict, columns=columns, index=index) + + +def check_bins(bins: pd.DataFrame, chromsizes: pd.Series) -> pd.DataFrame: + is_cat = isinstance(bins["chrom"].dtype, pd.CategoricalDtype) + bins = bins.copy() + if not is_cat: + bins["chrom"] = pd.Categorical( + bins.chrom, categories=list(chromsizes.index), ordered=True + ) + else: + assert (bins["chrom"].cat.categories == chromsizes.index).all() + + return bins + + +def balanced_partition( + gs: GenomeSegmentation, + n_chunk_max: int, + file_contigs: list[str], + loadings: list[int | float] | None = None +) -> list[GenomicRangeTuple]: + # n_bins = len(gs.bins) + grouped = gs._bins_grouped + + chrom_nbins = grouped.size() + if loadings is None: + loadings = chrom_nbins + chrmax = loadings.idxmax() + loadings = loadings / loadings.loc[chrmax] + const = chrom_nbins.loc[chrmax] / n_chunk_max + + granges = [] + for chrom, group in grouped: + if chrom not in file_contigs: + continue + clen = gs.chromsizes[chrom] + step = int(np.ceil(const / loadings.loc[chrom])) + anchors = group.start.values[::step] + if anchors[-1] != clen: + anchors = np.r_[anchors, clen] + granges.extend( + (chrom, start, end) for start, end in zip(anchors[:-1], anchors[1:]) + ) + return granges + + +class GenomeSegmentation: + def __init__(self, chromsizes: pd.Series, bins: pd.DataFrame): + bins = check_bins(bins, chromsizes) + self._bins_grouped = bins.groupby("chrom", 
observed=True, sort=False) + nbins_per_chrom = self._bins_grouped.size().values + + self.chromsizes = chromsizes + self.binsize = get_binsize(bins) + self.contigs = list(chromsizes.keys()) + self.bins = bins + self.idmap = pd.Series(index=chromsizes.keys(), data=range(len(chromsizes))) + self.chrom_binoffset = np.r_[0, np.cumsum(nbins_per_chrom)] + self.chrom_abspos = np.r_[0, np.cumsum(chromsizes.values)] + self.start_abspos = ( + self.chrom_abspos[bins["chrom"].cat.codes] + bins["start"].values + ) + + def fetch(self, region: GenomicRangeSpecifier) -> pd.DataFrame: + chrom, start, end = parse_region(region, self.chromsizes) + result = self._bins_grouped.get_group(chrom) + if start > 0 or end < self.chromsizes[chrom]: + lo = result["end"].values.searchsorted(start, side="right") + hi = lo + result["start"].values[lo:].searchsorted(end, side="left") + result = result.iloc[lo:hi] + return result + + +def buffered( + chunks: Iterable[pd.DataFrame], + size: int = 10000000 +) -> Iterator[pd.DataFrame]: + """ + Take an incoming iterator of small data frame chunks and buffer them into + an outgoing iterator of larger chunks. + + Parameters + ---------- + chunks : iterator of :py:class:`pandas.DataFrame` + Each chunk should have the same column names. + size : int + Minimum length of output chunks. + + Yields + ------ + Larger outgoing :py:class:`pandas.DataFrame` chunks made from concatenating + the incoming ones. 
+ + """ + buf = [] + n = 0 + for chunk in chunks: + n += len(chunk) + buf.append(chunk) + if n > size: + yield pd.concat(buf, axis=0) + buf = [] + n = 0 + if len(buf): + yield pd.concat(buf, axis=0) From 77cd6a1d0fda25925050ccc7eb6dfb9b77b3f3b1 Mon Sep 17 00:00:00 2001 From: ShigrafS <140247389+ShigrafS@users.noreply.github.com> Date: Tue, 4 Mar 2025 13:58:36 +0530 Subject: [PATCH 07/16] Update src/cooler/util.py Co-authored-by: Nezar Abdennur --- src/cooler/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cooler/util.py b/src/cooler/util.py index 44fb853d..20f72082 100644 --- a/src/cooler/util.py +++ b/src/cooler/util.py @@ -248,7 +248,7 @@ def read_chromsizes( chromtable = pd.read_csv( filepath_or, - sep="\t", # Ensuring tab is the delimiter + sep="\t", usecols=[0, 1], names=["name", "length"], dtype={"name": str}, From 7ea498dc87612e5d6d97568f674aae36014a5f7d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 4 Mar 2025 08:28:44 +0000 Subject: [PATCH 08/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/cooler/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cooler/util.py b/src/cooler/util.py index 20f72082..12b4e4bb 100644 --- a/src/cooler/util.py +++ b/src/cooler/util.py @@ -248,7 +248,7 @@ def read_chromsizes( chromtable = pd.read_csv( filepath_or, - sep="\t", + sep="\t", usecols=[0, 1], names=["name", "length"], dtype={"name": str}, From ffa8363333c9ca9a6dfe684938c3a6d95daf04f3 Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Wed, 5 Mar 2025 13:06:12 +0530 Subject: [PATCH 09/16] Removed verbose and added pandas built in on_bad_lines in def chromsizes. 
--- src/cooler/util.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/cooler/util.py b/src/cooler/util.py index 12b4e4bb..cb933250 100644 --- a/src/cooler/util.py +++ b/src/cooler/util.py @@ -209,7 +209,6 @@ def read_chromsizes( filepath_or: str | io.StringIO, name_patterns: tuple[str, ...] = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"), all_names: bool = False, - verbose: bool = False, # Optional parameter to enable verbose output **kwargs, ) -> pd.Series: """ @@ -242,16 +241,16 @@ def read_chromsizes( f"Chromsizes file '{filepath_or}' uses spaces instead of tabs " "as delimiters. Please use tabs.") - # Read the chromosome size file into a DataFrame - if verbose: - print(f"Reading chromsizes file: {filepath_or}") - + # Read the chromosome size file into a DataFrame + # on_bad_lines="error" will raise an error if any row does not have exactly two + # columns. chromtable = pd.read_csv( filepath_or, - sep="\t", + sep="\t", # Ensuring tab is the delimiter usecols=[0, 1], names=["name", "length"], dtype={"name": str}, + on_bad_lines="error", **kwargs, ) @@ -259,17 +258,16 @@ def read_chromsizes( # Convert the "length" column to numeric values, coercing errors to NaN chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce") - # Check for NaN values after reading the file + # Check for NaN values after conversion and raise an error if any are found. if chromtable["length"].isnull().any(): - invalid_rows = chromtable[chromtable["length"].isnull()] - if verbose: - print(f"Invalid rows detected: {invalid_rows}") raise ValueError( - f"Chromsizes file '{filepath_or}' contains missing or invalid " - "length values. Please ensure that the file is properly formatted " - "as tab-delimited with two columns: sequence name and integer " - "length. Check for extraneous spaces or hidden characters. " - "Invalid rows: \n{invalid_rows}" + f"Chromsizes file '{filepath_or}' contains missing or invalid length " + "values. 
" + "Please ensure that the file is properly formatted as tab-delimited with " + "two columns: " + "sequence name and integer length. " + "Check for extraneous spaces or hidden characters. " + f"Invalid rows: \n{chromtable[chromtable['length'].isnull()]}" ) # Filter by patterns if needed From 07841b5711a03892501b7d5384051c1a1092bcd4 Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Wed, 12 Mar 2025 19:02:27 +0530 Subject: [PATCH 10/16] Add column validation to cload pairs (#135) - Added validate_pairs_columns to check column indices against file content. - Updated get_header to use readline() instead of peek() for test compatibility. - Modified pairs to use validate_pairs_columns stream, added kwargs = {}, removed duplicate call. --- src/cooler/cli/cload.py | 1341 ++++++++++++++++++++------------------- 1 file changed, 675 insertions(+), 666 deletions(-) diff --git a/src/cooler/cli/cload.py b/src/cooler/cli/cload.py index ded4b192..1f77ab1a 100644 --- a/src/cooler/cli/cload.py +++ b/src/cooler/cli/cload.py @@ -1,666 +1,675 @@ -import sys - -import click -import h5py -import numpy as np -import pandas as pd -import simplejson as json -from cytoolz import compose -from multiprocess import Pool - -from ..create import ( - HDF5Aggregator, - PairixAggregator, - TabixAggregator, - aggregate_records, - create_cooler, - sanitize_records, -) -from . import cli, get_logger -from ._util import parse_bins, parse_field_param, parse_kv_list_param - -_pandas_version = pd.__version__.split(".") -if int(_pandas_version[0]) > 0: - from pandas.io.common import get_handle - - -# Copied from pairtools._headerops -def get_header(instream, comment_char="#"): - """Returns a header from the stream and an the reaminder of the stream - with the actual data. - Parameters - ---------- - instream : a file object - An input stream. - comment_char : str - The character prepended to header lines (use '@' when parsing sams, - '#' when parsing pairsams). 
- Returns - ------- - header : list - The header lines, stripped of terminal spaces and newline characters. - remainder_stream : stream/file-like object - Stream with the remaining lines. - - """ - header = [] - if not comment_char: - raise ValueError("Please, provide a comment char!") - comment_byte = comment_char.encode() - # get peekable buffer for the instream - read_f, peek_f = None, None - if hasattr(instream, "buffer"): - peek_f = instream.buffer.peek - readline_f = instream.buffer.readline - elif hasattr(instream, "peek"): - peek_f = instream.peek - readline_f = instream.readline - else: - raise ValueError("Cannot find the peek() function of the provided stream!") - - current_peek = peek_f(1) - while current_peek.startswith(comment_byte): - # consuming a line from buffer guarantees - # that the remainder of the buffer starts - # with the beginning of the line. - line = readline_f() - if isinstance(line, bytes): - line = line.decode() - # append line to header, since it does start with header - header.append(line.strip()) - # peek into the remainder of the instream - current_peek = peek_f(1) - # apparently, next line does not start with the comment - # return header and the instream, advanced to the beginning of the data - return header, instream - - -@cli.group() -def cload(): - """ - Create a cooler from genomic pairs and bins. - - Choose a subcommand based on the format of the input contact list. - - """ - pass - - -# flake8: noqa -def register_subcommand(func): - return cload.command()( - click.argument("bins", type=str, metavar="BINS")( - click.argument( - "pairs_path", - type=click.Path(exists=True, allow_dash=True), - metavar="PAIRS_PATH", - )( - click.argument("cool_path", metavar="COOL_PATH")( - click.option( - "--metadata", help="Path to JSON file containing user metadata." - )( - click.option( - "--assembly", - help="Name of genome assembly (e.g. 
hg19, mm10)", - )(func) - ) - ) - ) - ) - ) - - -def add_arg_help(func): - func.__doc__ = func.__doc__.format( - """ - BINS : One of the following - - : 1. Path to a chromsizes file, 2. Bin size in bp - - : Path to BED file defining the genomic bin segmentation. - - PAIRS_PATH : Path to contacts (i.e. read pairs) file. - - COOL_PATH : Output COOL file path or URI.""" - ) - return func - - -@register_subcommand -@add_arg_help -@click.option( - "--chunksize", - "-c", - help="Control the number of pixels handled by each worker process at a time.", - type=int, - default=int(100e6), - show_default=True, -) -def hiclib(bins, pairs_path, cool_path, metadata, assembly, chunksize): - """ - Bin a hiclib HDF5 contact list (frag) file. - - {} - - hiclib on BitBucket: . - - """ - - chromsizes, bins = parse_bins(bins) - - if metadata is not None: - with open(metadata) as f: - metadata = json.load(f) - - with h5py.File(pairs_path, "r") as h5pairs: - iterator = HDF5Aggregator(h5pairs, chromsizes, bins, chunksize) - create_cooler( - cool_path, - bins, - iterator, - metadata=metadata, - assembly=assembly, - ordered=True, - ) - - -@register_subcommand -@add_arg_help -@click.option( - "--nproc", - "-p", - help="Number of processes to split the work between.", - type=int, - default=8, - show_default=True, -) -@click.option( - "--chrom2", "-c2", help="chrom2 field number (one-based)", type=int, default=4 -) -@click.option( - "--pos2", "-p2", help="pos2 field number (one-based)", type=int, default=5 -) -@click.option( - "--zero-based", - "-0", - help="Positions are zero-based", - is_flag=True, - default=False, - show_default=True, -) -@click.option( - "--max-split", - "-s", - help="Divide the pairs from each chromosome into at most this many chunks. " - "Smaller chromosomes will be split less frequently or not at all. 
" - "Increase ths value if large chromosomes dominate the workload on " - "multiple processors.", - type=int, - default=2, - show_default=True, -) -def tabix( - bins, - pairs_path, - cool_path, - metadata, - assembly, - nproc, - zero_based, - max_split, - **kwargs, -): - """ - Bin a tabix-indexed contact list file. - - {} - - See also: 'cooler csort' to sort and index a contact list file - - Tabix manpage: . - - """ - logger = get_logger(__name__) - chromsizes, bins = parse_bins(bins) - - if metadata is not None: - with open(metadata) as f: - metadata = json.load(f) - - try: - map_func = map - if nproc > 1: - pool = Pool(nproc) - logger.info(f"Using {nproc} cores") - map_func = pool.imap - - opts = {} - if "chrom2" in kwargs: - opts["C2"] = kwargs["chrom2"] - 1 - if "pos2" in kwargs: - opts["P2"] = kwargs["pos2"] - 1 - - iterator = TabixAggregator( - pairs_path, - chromsizes, - bins, - map=map_func, - is_one_based=(not zero_based), - n_chunks=max_split, - **opts, - ) - - create_cooler( - cool_path, - bins, - iterator, - metadata=metadata, - assembly=assembly, - ordered=True, - ) - finally: - if nproc > 1: - pool.close() - - -@register_subcommand -@add_arg_help -@click.option( - "--nproc", - "-p", - help="Number of processes to split the work between.", - type=int, - default=8, - show_default=True, -) -@click.option( - "--zero-based", - "-0", - help="Positions are zero-based", - is_flag=True, - default=False, - show_default=True, -) -@click.option( - "--max-split", - "-s", - help="Divide the pairs from each chromosome into at most this many chunks. " - "Smaller chromosomes will be split less frequently or not at all. 
" - "Increase ths value if large chromosomes dominate the workload on " - "multiple processors.", - type=int, - default=2, - show_default=True, -) -@click.option( - "--block-char", - help="Character separating contig names in the block names of the pairix " - "index.", - type=str, - default="|", - show_default=True, -) -def pairix( - bins, - pairs_path, - cool_path, - metadata, - assembly, - nproc, - zero_based, - max_split, - block_char, -): - """ - Bin a pairix-indexed contact list file. - - {} - - See also: 'cooler csort' to sort and index a contact list file - - Pairix on GitHub: . - - """ - logger = get_logger(__name__) - chromsizes, bins = parse_bins(bins) - - if metadata is not None: - with open(metadata) as f: - metadata = json.load(f) - - try: - map_func = map - if nproc > 1: - pool = Pool(nproc) - logger.info(f"Using {nproc} cores") - map_func = pool.imap - - iterator = PairixAggregator( - pairs_path, - chromsizes, - bins, - map=map_func, - is_one_based=(not zero_based), - n_chunks=max_split, - block_char=block_char, - ) - - create_cooler( - cool_path, - bins, - iterator, - metadata=metadata, - assembly=assembly, - ordered=True, - ) - finally: - if nproc > 1: - pool.close() - - -@register_subcommand -@add_arg_help -@click.option( - "--chrom1", "-c1", help="chrom1 field number (one-based)", type=int, required=True -) -@click.option( - "--pos1", "-p1", help="pos1 field number (one-based)", type=int, required=True -) -@click.option( - "--chrom2", "-c2", help="chrom2 field number (one-based)", type=int, required=True -) -@click.option( - "--pos2", "-p2", help="pos2 field number (one-based)", type=int, required=True -) -@click.option( - "--zero-based", - "-0", - help="Positions are zero-based", - is_flag=True, - default=False, - show_default=True, -) -@click.option( - "--comment-char", - type=str, - default="#", - show_default=True, - help="Comment character that indicates lines to ignore.", -) -@click.option( - "--no-symmetric-upper", - "-N", - help="Create a 
complete square matrix without implicit symmetry. " - "This allows for distinct upper- and lower-triangle values", - is_flag=True, - default=False, -) -@click.option( - "--input-copy-status", - type=click.Choice(["unique", "duplex"]), - default="unique", - help="Copy status of input data when using symmetric-upper storage. | " - "`unique`: Incoming data comes from a unique half of a symmetric " - "map, regardless of how the coordinates of a pair are ordered. " - "`duplex`: Incoming data contains upper- and lower-triangle duplicates. " - "All input records that map to the lower triangle will be discarded! | " - "If you wish to treat lower- and upper-triangle input data as " - "distinct, use the ``--no-symmetric-upper`` option. ", - show_default=True, -) -@click.option( - "--field", - help="Specify quantitative input fields to aggregate into value columns " - "using the syntax ``--field =``. " - "Optionally, append ``:`` followed by ``dtype=`` to specify " - "the data type (e.g. float), and/or ``agg=`` to " - "specify an aggregation function different from sum (e.g. mean). " - "Field numbers are 1-based. Passing 'count' as the target name will " - "override the default behavior of storing pair counts. " - "Repeat the ``--field`` option for each additional field.", - type=str, - multiple=True, -) -# @click.option( -# "--no-count", -# help="Do not store the pair counts. Use this only if you use `--field` to " -# "specify at least one input field for aggregation as an alternative.", -# is_flag=True, -# default=False) -@click.option( - "--chunksize", - "-c", - help="Size in number of lines/records of data chunks to read and process " - "from the input stream at a time. These chunks will be saved as temporary " - "partial coolers and then merged.", - type=int, - default=15_000_000, -) -@click.option( - "--mergebuf", - help="Total number of pixel records to buffer per epoch of merging data. 
" - "Defaults to the same value as `chunksize`.", - type=int, -) -@click.option( - "--max-merge", - help="Maximum number of chunks to merge in a single pass.", - type=int, - default=200, - show_default=True, -) -@click.option( - "--temp-dir", - help="Create temporary files in a specified directory. Pass ``-`` to use " - "the platform default temp dir.", - type=click.Path(exists=True, file_okay=False, dir_okay=True, allow_dash=True), -) -@click.option( - "--no-delete-temp", - help="Do not delete temporary files when finished.", - is_flag=True, - default=False, -) -@click.option( - "--storage-options", - help="Options to modify the data filter pipeline. Provide as a " - "comma-separated list of key-value pairs of the form 'k1=v1,k2=v2,...'. " - "See http://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline " - "for more details.", -) -@click.option( - "--append", - "-a", - is_flag=True, - default=False, - help="Pass this flag to append the output cooler to an existing file " - "instead of overwriting the file.", -) -# @click.option( -# "--format", "-f", -# help="Preset data format.", -# type=click.Choice(['4DN', 'BEDPE'])) -# --sep -def pairs( - bins, - pairs_path, - cool_path, - metadata, - assembly, - zero_based, - comment_char, - input_copy_status, - no_symmetric_upper, - field, - chunksize, - mergebuf, - temp_dir, - no_delete_temp, - max_merge, - storage_options, - append, - **kwargs, -): - """ - Bin any text file or stream of pairs. - - Pairs data need not be sorted. Accepts compressed files. - To pipe input from stdin, set PAIRS_PATH to '-'. 
- - {} - - """ - chromsizes, bins = parse_bins(bins) - if mergebuf is None: - mergebuf = chunksize - - symmetric_upper = not no_symmetric_upper - tril_action = None - if symmetric_upper: - if input_copy_status == "unique": - tril_action = "reflect" - elif input_copy_status == "duplex": - tril_action = "drop" - - if metadata is not None: - with open(metadata) as f: - metadata = json.load(f) - - input_field_names = [ - "chrom1", - "pos1", - "chrom2", - "pos2", - ] - input_field_dtypes = { - "chrom1": str, - "pos1": np.int64, - "chrom2": str, - "pos2": np.int64, - } - input_field_numbers = {} - for name in ["chrom1", "pos1", "chrom2", "pos2"]: - if kwargs[name] == 0: - raise click.BadParameter("Field numbers start at 1", param_hint=name) - input_field_numbers[name] = kwargs[name] - 1 - - # Include input value columns - output_field_names = [] - output_field_dtypes = {} - aggregations = {} - if len(field): - for arg in field: - name, colnum, dtype, agg = parse_field_param(arg) - - # Special cases: these do not have input fields. - # Omit field number and agg to change standard dtypes. 
- if colnum is None: - if ( - agg is None - and dtype is not None - and name in {"bin1_id", "bin2_id", "count"} - ): - output_field_dtypes[name] = dtype - continue - else: - raise click.BadParameter( - "A field number is required.", param_hint=arg - ) - - if name not in input_field_names: - input_field_names.append(name) - - if name not in output_field_names: - output_field_names.append(name) - - input_field_numbers[name] = colnum - - if dtype is not None: - input_field_dtypes[name] = dtype - output_field_dtypes[name] = dtype - - if agg is not None: - aggregations[name] = agg - else: - aggregations[name] = "sum" - - # # Pairs counts are always produced, unless supressed explicitly - # do_count = not no_count - # if do_count: - # if 'count' not in output_field_names: - # output_field_names.append('count') # default dtype and agg - # else: - # if not len(output_field_names): - # click.BadParameter( - # "To pass `--no-count`, specify at least one input " - # "value-column using `--field`.") - if "count" not in output_field_names: - output_field_names.append("count") - - # Customize the HDF5 filters - if storage_options is not None: - h5opts = parse_kv_list_param(storage_options) - for key in h5opts: - if isinstance(h5opts[key], list): - h5opts[key] = tuple(h5opts[key]) - else: - h5opts = None - - # Initialize the input stream - # TODO: we could save the header into metadata - kwargs = {} - if pairs_path == "-": - f_in = sys.stdin - elif int(_pandas_version[0]) == 1 and int(_pandas_version[1]) < 2: - # get_handle returns a pair of objects in pandas 1.0 and 1.1 - f_in = get_handle(pairs_path, mode="r", compression="infer")[0] - else: - # get_handle returns a single wrapper object in pandas 1.2+ and 2.* - f_in = get_handle(pairs_path, mode="r", compression="infer").handle - - _, f_in = get_header(f_in) - - reader = pd.read_csv( - f_in, - sep="\t", - usecols=[input_field_numbers[name] for name in input_field_names], - names=input_field_names, - dtype=input_field_dtypes, - 
iterator=True, - chunksize=chunksize, - **kwargs, - ) - - sanitize = sanitize_records( - bins, - schema="pairs", - decode_chroms=True, - is_one_based=not zero_based, - tril_action=tril_action, - sort=True, - validate=True, - ) - aggregate = aggregate_records(agg=aggregations, count=True, sort=False) - pipeline = compose(aggregate, sanitize) - - create_cooler( - cool_path, - bins, - map(pipeline, reader), - columns=output_field_names, - dtypes=output_field_dtypes, - metadata=metadata, - assembly=assembly, - mergebuf=mergebuf, - max_merge=max_merge, - temp_dir=temp_dir, - delete_temp=not no_delete_temp, - boundscheck=False, - triucheck=False, - dupcheck=False, - ensure_sorted=False, - symmetric_upper=symmetric_upper, - h5opts=h5opts, - ordered=False, - mode="a" if append else "w", - ) +import sys + +import click +from .. import create_cooler +import h5py +import io +import numpy as np +import pandas as pd +import simplejson as json +from cytoolz import compose +from multiprocess import Pool + +from ..create import ( + HDF5Aggregator, + PairixAggregator, + TabixAggregator, + aggregate_records, + create_cooler, + sanitize_records, +) +from . import cli, get_logger +from ._util import parse_bins, parse_field_param, parse_kv_list_param + +_pandas_version = pd.__version__.split(".") +if int(_pandas_version[0]) > 0: + from pandas.io.common import get_handle + + +def get_header(instream, comment_char="#"): + """Returns a header from the stream and the remainder of the stream + with the actual data. + + Parameters + ---------- + instream : a file object + An input stream. + comment_char : str + The character prepended to header lines (use '@' when parsing sams, + '#' when parsing pairsams). + Returns + ------- + header : list + The header lines, stripped of terminal spaces and newline characters. + remainder_stream : stream/file-like object + Stream with the remaining lines. 
+ """ + if not comment_char: + raise ValueError("Please, provide a comment char!") + + header = [] + # Buffer the input stream to handle non-peekable streams like StringIO + buffered_lines = [] + readline_f = instream.readline + + while True: + line = readline_f() + if not line: # End of stream + break + if isinstance(line, bytes): + line = line.decode() + if not line.startswith(comment_char): + buffered_lines.append(line) + break + header.append(line.strip()) + + # Create a new stream with the remaining data + from io import StringIO + remainder_stream = StringIO("".join(buffered_lines) + instream.read()) if buffered_lines else instream + return header, remainder_stream + +def validate_pairs_columns(file_path: str, field_numbers: dict[str, int], is_stdin: bool = False) -> io.StringIO | io.TextIOBase: + """ + Validate that column indices for chrom1, pos1, chrom2, pos2, and any additional fields + do not exceed the number of columns in the pairs file. Returns the stream positioned after headers. + + Args: + file_path: Path to the pairs file or '-' for stdin. + field_numbers: Dictionary mapping field names (e.g., 'chrom1') to zero-based column indices. + is_stdin: True if input is from stdin. + + Returns: + File-like object positioned at the start of data. + + Raises: + ValueError: If any column index exceeds the number of columns in the file. 
+ """ + if is_stdin: + input_data = sys.stdin.read() if hasattr(sys.stdin, 'read') else sys.stdin.getvalue() + f = io.StringIO(input_data) + else: + f = open(file_path, 'r') + + _, f = get_header(f, comment_char="#") + pos = f.tell() + first_data_line = f.readline().strip() + f.seek(pos) # Rewind to preserve the stream + if not first_data_line: + raise ValueError(f"Pairs file '{file_path}' is empty or contains only header lines") + + num_cols = len(first_data_line.split("\t")) + for name, idx in field_numbers.items(): + if idx >= num_cols: + raise ValueError( + f"Column index {idx + 1} ({name}) exceeds number of columns ({num_cols}) in '{file_path}'" + ) + return f + +@cli.group() +def cload(): + """ + Create a cooler from genomic pairs and bins. + + Choose a subcommand based on the format of the input contact list. + + """ + pass + + +# flake8: noqa +def register_subcommand(func): + return cload.command()( + click.argument("bins", type=str, metavar="BINS")( + click.argument( + "pairs_path", + type=click.Path(exists=True, allow_dash=True), + metavar="PAIRS_PATH", + )( + click.argument("cool_path", metavar="COOL_PATH")( + click.option( + "--metadata", help="Path to JSON file containing user metadata." + )( + click.option( + "--assembly", + help="Name of genome assembly (e.g. hg19, mm10)", + )(func) + ) + ) + ) + ) + ) + + +def add_arg_help(func): + func.__doc__ = func.__doc__.format( + """ + BINS : One of the following + + : 1. Path to a chromsizes file, 2. Bin size in bp + + : Path to BED file defining the genomic bin segmentation. + + PAIRS_PATH : Path to contacts (i.e. read pairs) file. 
+ + COOL_PATH : Output COOL file path or URI.""" + ) + return func + + +@register_subcommand +@add_arg_help +@click.option( + "--chunksize", + "-c", + help="Control the number of pixels handled by each worker process at a time.", + type=int, + default=int(100e6), + show_default=True, +) +def hiclib(bins, pairs_path, cool_path, metadata, assembly, chunksize): + """ + Bin a hiclib HDF5 contact list (frag) file. + + {} + + hiclib on BitBucket: . + + """ + + chromsizes, bins = parse_bins(bins) + + if metadata is not None: + with open(metadata) as f: + metadata = json.load(f) + + with h5py.File(pairs_path, "r") as h5pairs: + iterator = HDF5Aggregator(h5pairs, chromsizes, bins, chunksize) + create_cooler( + cool_path, + bins, + iterator, + metadata=metadata, + assembly=assembly, + ordered=True, + ) + + +@register_subcommand +@add_arg_help +@click.option( + "--nproc", + "-p", + help="Number of processes to split the work between.", + type=int, + default=8, + show_default=True, +) +@click.option( + "--chrom2", "-c2", help="chrom2 field number (one-based)", type=int, default=4 +) +@click.option( + "--pos2", "-p2", help="pos2 field number (one-based)", type=int, default=5 +) +@click.option( + "--zero-based", + "-0", + help="Positions are zero-based", + is_flag=True, + default=False, + show_default=True, +) +@click.option( + "--max-split", + "-s", + help="Divide the pairs from each chromosome into at most this many chunks. " + "Smaller chromosomes will be split less frequently or not at all. " + "Increase ths value if large chromosomes dominate the workload on " + "multiple processors.", + type=int, + default=2, + show_default=True, +) +def tabix( + bins, + pairs_path, + cool_path, + metadata, + assembly, + nproc, + zero_based, + max_split, + **kwargs, +): + """ + Bin a tabix-indexed contact list file. + + {} + + See also: 'cooler csort' to sort and index a contact list file + + Tabix manpage: . 
+ + """ + logger = get_logger(__name__) + chromsizes, bins = parse_bins(bins) + + if metadata is not None: + with open(metadata) as f: + metadata = json.load(f) + + try: + map_func = map + if nproc > 1: + pool = Pool(nproc) + logger.info(f"Using {nproc} cores") + map_func = pool.imap + + opts = {} + if "chrom2" in kwargs: + opts["C2"] = kwargs["chrom2"] - 1 + if "pos2" in kwargs: + opts["P2"] = kwargs["pos2"] - 1 + + iterator = TabixAggregator( + pairs_path, + chromsizes, + bins, + map=map_func, + is_one_based=(not zero_based), + n_chunks=max_split, + **opts, + ) + + create_cooler( + cool_path, + bins, + iterator, + metadata=metadata, + assembly=assembly, + ordered=True, + ) + finally: + if nproc > 1: + pool.close() + + +@register_subcommand +@add_arg_help +@click.option( + "--nproc", + "-p", + help="Number of processes to split the work between.", + type=int, + default=8, + show_default=True, +) +@click.option( + "--zero-based", + "-0", + help="Positions are zero-based", + is_flag=True, + default=False, + show_default=True, +) +@click.option( + "--max-split", + "-s", + help="Divide the pairs from each chromosome into at most this many chunks. " + "Smaller chromosomes will be split less frequently or not at all. " + "Increase ths value if large chromosomes dominate the workload on " + "multiple processors.", + type=int, + default=2, + show_default=True, +) +@click.option( + "--block-char", + help="Character separating contig names in the block names of the pairix " + "index.", + type=str, + default="|", + show_default=True, +) +def pairix( + bins, + pairs_path, + cool_path, + metadata, + assembly, + nproc, + zero_based, + max_split, + block_char, +): + """ + Bin a pairix-indexed contact list file. + + {} + + See also: 'cooler csort' to sort and index a contact list file + + Pairix on GitHub: . 
+ + """ + logger = get_logger(__name__) + chromsizes, bins = parse_bins(bins) + + if metadata is not None: + with open(metadata) as f: + metadata = json.load(f) + + try: + map_func = map + if nproc > 1: + pool = Pool(nproc) + logger.info(f"Using {nproc} cores") + map_func = pool.imap + + iterator = PairixAggregator( + pairs_path, + chromsizes, + bins, + map=map_func, + is_one_based=(not zero_based), + n_chunks=max_split, + block_char=block_char, + ) + + create_cooler( + cool_path, + bins, + iterator, + metadata=metadata, + assembly=assembly, + ordered=True, + ) + finally: + if nproc > 1: + pool.close() + + +@register_subcommand +@add_arg_help +@click.option( + "--chrom1", "-c1", help="chrom1 field number (one-based)", type=int, required=True +) +@click.option( + "--pos1", "-p1", help="pos1 field number (one-based)", type=int, required=True +) +@click.option( + "--chrom2", "-c2", help="chrom2 field number (one-based)", type=int, required=True +) +@click.option( + "--pos2", "-p2", help="pos2 field number (one-based)", type=int, required=True +) +@click.option( + "--zero-based", + "-0", + help="Positions are zero-based", + is_flag=True, + default=False, + show_default=True, +) +@click.option( + "--comment-char", + type=str, + default="#", + show_default=True, + help="Comment character that indicates lines to ignore.", +) +@click.option( + "--no-symmetric-upper", + "-N", + help="Create a complete square matrix without implicit symmetry. " + "This allows for distinct upper- and lower-triangle values", + is_flag=True, + default=False, +) +@click.option( + "--input-copy-status", + type=click.Choice(["unique", "duplex"]), + default="unique", + help="Copy status of input data when using symmetric-upper storage. | " + "`unique`: Incoming data comes from a unique half of a symmetric " + "map, regardless of how the coordinates of a pair are ordered. " + "`duplex`: Incoming data contains upper- and lower-triangle duplicates. 
" + "All input records that map to the lower triangle will be discarded! | " + "If you wish to treat lower- and upper-triangle input data as " + "distinct, use the ``--no-symmetric-upper`` option. ", + show_default=True, +) +@click.option( + "--field", + help="Specify quantitative input fields to aggregate into value columns " + "using the syntax ``--field =``. " + "Optionally, append ``:`` followed by ``dtype=`` to specify " + "the data type (e.g. float), and/or ``agg=`` to " + "specify an aggregation function different from sum (e.g. mean). " + "Field numbers are 1-based. Passing 'count' as the target name will " + "override the default behavior of storing pair counts. " + "Repeat the ``--field`` option for each additional field.", + type=str, + multiple=True, +) +# @click.option( +# "--no-count", +# help="Do not store the pair counts. Use this only if you use `--field` to " +# "specify at least one input field for aggregation as an alternative.", +# is_flag=True, +# default=False) +@click.option( + "--chunksize", + "-c", + help="Size in number of lines/records of data chunks to read and process " + "from the input stream at a time. These chunks will be saved as temporary " + "partial coolers and then merged.", + type=int, + default=15_000_000, +) +@click.option( + "--mergebuf", + help="Total number of pixel records to buffer per epoch of merging data. " + "Defaults to the same value as `chunksize`.", + type=int, +) +@click.option( + "--max-merge", + help="Maximum number of chunks to merge in a single pass.", + type=int, + default=200, + show_default=True, +) +@click.option( + "--temp-dir", + help="Create temporary files in a specified directory. 
Pass ``-`` to use " + "the platform default temp dir.", + type=click.Path(exists=True, file_okay=False, dir_okay=True, allow_dash=True), +) +@click.option( + "--no-delete-temp", + help="Do not delete temporary files when finished.", + is_flag=True, + default=False, +) +@click.option( + "--storage-options", + help="Options to modify the data filter pipeline. Provide as a " + "comma-separated list of key-value pairs of the form 'k1=v1,k2=v2,...'. " + "See http://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline " + "for more details.", +) +@click.option( + "--append", + "-a", + is_flag=True, + default=False, + help="Pass this flag to append the output cooler to an existing file " + "instead of overwriting the file.", +) +# @click.option( +# "--format", "-f", +# help="Preset data format.", +# type=click.Choice(['4DN', 'BEDPE'])) +# --sep +def pairs( + bins, + pairs_path, + cool_path, + metadata, + assembly, + zero_based, + comment_char, + input_copy_status, + no_symmetric_upper, + field, + chunksize, + mergebuf, + temp_dir, + no_delete_temp, + max_merge, + storage_options, + append, + **kwargs, +): + """ + Bin any text file or stream of pairs. + + Pairs data need not be sorted. Accepts compressed files. + To pipe input from stdin, set PAIRS_PATH to '-'. 
+ + {} + + """ + chromsizes, bins = parse_bins(bins) + if mergebuf is None: + mergebuf = chunksize + + symmetric_upper = not no_symmetric_upper + tril_action = None + if symmetric_upper: + if input_copy_status == "unique": + tril_action = "reflect" + elif input_copy_status == "duplex": + tril_action = "drop" + + if metadata is not None: + with open(metadata) as f: + metadata = json.load(f) + + input_field_names = [ + "chrom1", + "pos1", + "chrom2", + "pos2", + ] + input_field_dtypes = { + "chrom1": str, + "pos1": np.int64, + "chrom2": str, + "pos2": np.int64, + } + input_field_numbers = {} + for name in ["chrom1", "pos1", "chrom2", "pos2"]: + if kwargs[name] == 0: + raise click.BadParameter("Field numbers start at 1", param_hint=name) + input_field_numbers[name] = kwargs[name] - 1 + + # Include input value columns + output_field_names = [] + output_field_dtypes = {} + aggregations = {} + if len(field): + for arg in field: + name, colnum, dtype, agg = parse_field_param(arg) + if colnum is None: + if ( + agg is None + and dtype is not None + and name in {"bin1_id", "bin2_id", "count"} + ): + output_field_dtypes[name] = dtype + continue + else: + raise click.BadParameter( + "A field number is required.", param_hint=arg + ) + + if name not in input_field_names: + input_field_names.append(name) + + if name not in output_field_names: + output_field_names.append(name) + + input_field_numbers[name] = colnum + + if dtype is not None: + input_field_dtypes[name] = dtype + output_field_dtypes[name] = dtype + + if agg is not None: + aggregations[name] = agg + else: + aggregations[name] = "sum" + + if "count" not in output_field_names: + output_field_names.append("count") + + # Validate and get the stream + f_in = validate_pairs_columns(pairs_path, input_field_numbers, is_stdin=(pairs_path == "-")) + + # Customize the HDF5 filters + if storage_options is not None: + h5opts = parse_kv_list_param(storage_options) + for key in h5opts: + if isinstance(h5opts[key], list): + 
h5opts[key] = tuple(h5opts[key]) + else: + h5opts = None + + # Explicitly set kwargs to empty to avoid passing Click options to read_csv + kwargs = {} + + reader = pd.read_csv( + f_in, + sep="\t", + usecols=[input_field_numbers[name] for name in input_field_names], + names=input_field_names, + dtype=input_field_dtypes, + iterator=True, + chunksize=chunksize, + **kwargs, + ) + + sanitize = sanitize_records( + bins, + schema="pairs", + decode_chroms=True, + is_one_based=not zero_based, + tril_action=tril_action, + sort=True, + validate=True, + ) + aggregate = aggregate_records(agg=aggregations, count=True, sort=False) + pipeline = compose(aggregate, sanitize) + + create_cooler( + cool_path, + bins, + map(pipeline, reader), + columns=output_field_names, + dtypes=output_field_dtypes, + metadata=metadata, + assembly=assembly, + mergebuf=mergebuf, + max_merge=max_merge, + temp_dir=temp_dir, + delete_temp=not no_delete_temp, + boundscheck=False, + triucheck=False, + dupcheck=False, + ensure_sorted=False, + symmetric_upper=symmetric_upper, + h5opts=h5opts, + ordered=False, + mode="a" if append else "w", + ) From 8a50d2497fc9c8dfbc8eba88975d68cfbe187b9d Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Wed, 12 Mar 2025 19:19:03 +0530 Subject: [PATCH 11/16] Add tests for cload pairs column validation (#135) - Created 7 pytest cases in test_cload.py to test valid/invalid indices, stdin, headers, empty files, and extra fields. - Ensures validate_pairs_columns and pairs handle file and stdin inputs correctly. 
--- tests/test_cload.py | 210 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 tests/test_cload.py diff --git a/tests/test_cload.py b/tests/test_cload.py new file mode 100644 index 00000000..563089b9 --- /dev/null +++ b/tests/test_cload.py @@ -0,0 +1,210 @@ +from click.testing import CliRunner + +from cooler.cli.cload import pairs + +runner = CliRunner() + + +def test_valid_indices(tmpdir): + """Test that valid column indices process successfully.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\nchr1\t1000000\t2000000\nchr2\t0\t1000000\n") + pairs_path.write("chr1\t1000\tchr1\t2000\nchr1\t1500\tchr2\t3000\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code == 0, f"Command failed: {result.output}" + + +def test_invalid_index(tmpdir): + """Test that an invalid column index raises an error.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_path.write("chr1\t1000\tchr1\t2000\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "5", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code != 0, "Command should have failed" + assert isinstance(result.exception, ValueError), ( + f"Expected ValueError, got {result.exception}" + ) + assert ( + "Column index 5 (pos2) exceeds number of columns (4)" in str(result.exception) + ), ( + f"Unexpected error: {result.exception}" + ) + + +def test_stdin_valid(tmpdir): + """Test valid indices with stdin input.""" + bins_path = tmpdir.join("bins.bed") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_data = "chr1\t1000\tchr1\t2000\n" + 
+ result = runner.invoke( + pairs, + [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + str(bins_path), + "-", + str(out_path), + ], + input=pairs_data, + ) + + assert result.exit_code == 0, f"Command failed: {result.output} {result.exception}" + + +def test_stdin_invalid(tmpdir): + """Test invalid index with stdin raises an error.""" + bins_path = tmpdir.join("bins.bed") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_data = "chr1\t1000\tchr1\t2000\n" + + result = runner.invoke( + pairs, + [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "5", + str(bins_path), + "-", + str(out_path), + ], + input=pairs_data, + ) + + assert result.exit_code != 0, "Command should have failed" + assert isinstance(result.exception, ValueError), ( + f"Expected ValueError, got {result.exception}" + ) + assert ( + "Column index 5 (pos2) exceeds number of columns (4)" in str(result.exception) + ), ( + f"Unexpected error: {result.exception}" + ) + + +def test_header_skipping(tmpdir): + """Test that headers are skipped and validation works.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_path.write("#comment1\n#comment2\nchr1\t1000\tchr1\t2000\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code == 0, f"Command failed: {result.output}" + + +def test_empty_file(tmpdir): + """Test that an empty file or header-only file raises an error.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_path.write("#comment1\n#comment2\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert 
result.exit_code != 0, "Command should have failed" + assert isinstance(result.exception, ValueError), ( + f"Expected ValueError, got {result.exception}" + ) + assert "is empty or contains only header lines" in str(result.exception), ( + f"Unexpected error: {result.exception}" + ) + + +def test_extra_field(tmpdir): + """Test validation with an additional --field column.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\nchr2\t0\t1000000\n") + pairs_path.write("chr1\t1000\tchr1\t2000\t1.5\nchr1\t1500\tchr2\t3000\t2.0\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + "--field", "value=5", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code == 0, f"Command failed: {result.output}" + + # Test invalid extra field + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + "--field", "value=6", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code != 0, "Command should have failed" + assert isinstance(result.exception, ValueError), ( + f"Expected ValueError, got {result.exception}" + ) + assert ( + "Column index 6 (value) exceeds number of columns (5)" in str(result.exception) + ), ( + f"Unexpected error: {result.exception}" + ) From 35df58ef16c5bbf4ab3c75cca2dd5bc8a0bde9e3 Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Wed, 12 Mar 2025 19:23:15 +0530 Subject: [PATCH 12/16] Switched tst_cload.py to LF. 
--- tests/test_cload.py | 420 ++++++++++++++++++++++---------------------- 1 file changed, 210 insertions(+), 210 deletions(-) diff --git a/tests/test_cload.py b/tests/test_cload.py index 563089b9..d0f8cfd0 100644 --- a/tests/test_cload.py +++ b/tests/test_cload.py @@ -1,210 +1,210 @@ -from click.testing import CliRunner - -from cooler.cli.cload import pairs - -runner = CliRunner() - - -def test_valid_indices(tmpdir): - """Test that valid column indices process successfully.""" - bins_path = tmpdir.join("bins.bed") - pairs_path = tmpdir.join("pairs.txt") - out_path = tmpdir.join("out.cool") - - bins_path.write("chr1\t0\t1000000\nchr1\t1000000\t2000000\nchr2\t0\t1000000\n") - pairs_path.write("chr1\t1000\tchr1\t2000\nchr1\t1500\tchr2\t3000\n") - - result = runner.invoke(pairs, [ - "-c1", "1", - "-p1", "2", - "-c2", "3", - "-p2", "4", - str(bins_path), - str(pairs_path), - str(out_path), - ]) - - assert result.exit_code == 0, f"Command failed: {result.output}" - - -def test_invalid_index(tmpdir): - """Test that an invalid column index raises an error.""" - bins_path = tmpdir.join("bins.bed") - pairs_path = tmpdir.join("pairs.txt") - out_path = tmpdir.join("out.cool") - - bins_path.write("chr1\t0\t1000000\n") - pairs_path.write("chr1\t1000\tchr1\t2000\n") - - result = runner.invoke(pairs, [ - "-c1", "1", - "-p1", "2", - "-c2", "3", - "-p2", "5", - str(bins_path), - str(pairs_path), - str(out_path), - ]) - - assert result.exit_code != 0, "Command should have failed" - assert isinstance(result.exception, ValueError), ( - f"Expected ValueError, got {result.exception}" - ) - assert ( - "Column index 5 (pos2) exceeds number of columns (4)" in str(result.exception) - ), ( - f"Unexpected error: {result.exception}" - ) - - -def test_stdin_valid(tmpdir): - """Test valid indices with stdin input.""" - bins_path = tmpdir.join("bins.bed") - out_path = tmpdir.join("out.cool") - - bins_path.write("chr1\t0\t1000000\n") - pairs_data = "chr1\t1000\tchr1\t2000\n" - - result = 
runner.invoke( - pairs, - [ - "-c1", "1", - "-p1", "2", - "-c2", "3", - "-p2", "4", - str(bins_path), - "-", - str(out_path), - ], - input=pairs_data, - ) - - assert result.exit_code == 0, f"Command failed: {result.output} {result.exception}" - - -def test_stdin_invalid(tmpdir): - """Test invalid index with stdin raises an error.""" - bins_path = tmpdir.join("bins.bed") - out_path = tmpdir.join("out.cool") - - bins_path.write("chr1\t0\t1000000\n") - pairs_data = "chr1\t1000\tchr1\t2000\n" - - result = runner.invoke( - pairs, - [ - "-c1", "1", - "-p1", "2", - "-c2", "3", - "-p2", "5", - str(bins_path), - "-", - str(out_path), - ], - input=pairs_data, - ) - - assert result.exit_code != 0, "Command should have failed" - assert isinstance(result.exception, ValueError), ( - f"Expected ValueError, got {result.exception}" - ) - assert ( - "Column index 5 (pos2) exceeds number of columns (4)" in str(result.exception) - ), ( - f"Unexpected error: {result.exception}" - ) - - -def test_header_skipping(tmpdir): - """Test that headers are skipped and validation works.""" - bins_path = tmpdir.join("bins.bed") - pairs_path = tmpdir.join("pairs.txt") - out_path = tmpdir.join("out.cool") - - bins_path.write("chr1\t0\t1000000\n") - pairs_path.write("#comment1\n#comment2\nchr1\t1000\tchr1\t2000\n") - - result = runner.invoke(pairs, [ - "-c1", "1", - "-p1", "2", - "-c2", "3", - "-p2", "4", - str(bins_path), - str(pairs_path), - str(out_path), - ]) - - assert result.exit_code == 0, f"Command failed: {result.output}" - - -def test_empty_file(tmpdir): - """Test that an empty file or header-only file raises an error.""" - bins_path = tmpdir.join("bins.bed") - pairs_path = tmpdir.join("pairs.txt") - out_path = tmpdir.join("out.cool") - - bins_path.write("chr1\t0\t1000000\n") - pairs_path.write("#comment1\n#comment2\n") - - result = runner.invoke(pairs, [ - "-c1", "1", - "-p1", "2", - "-c2", "3", - "-p2", "4", - str(bins_path), - str(pairs_path), - str(out_path), - ]) - - assert 
result.exit_code != 0, "Command should have failed" - assert isinstance(result.exception, ValueError), ( - f"Expected ValueError, got {result.exception}" - ) - assert "is empty or contains only header lines" in str(result.exception), ( - f"Unexpected error: {result.exception}" - ) - - -def test_extra_field(tmpdir): - """Test validation with an additional --field column.""" - bins_path = tmpdir.join("bins.bed") - pairs_path = tmpdir.join("pairs.txt") - out_path = tmpdir.join("out.cool") - - bins_path.write("chr1\t0\t1000000\nchr2\t0\t1000000\n") - pairs_path.write("chr1\t1000\tchr1\t2000\t1.5\nchr1\t1500\tchr2\t3000\t2.0\n") - - result = runner.invoke(pairs, [ - "-c1", "1", - "-p1", "2", - "-c2", "3", - "-p2", "4", - "--field", "value=5", - str(bins_path), - str(pairs_path), - str(out_path), - ]) - - assert result.exit_code == 0, f"Command failed: {result.output}" - - # Test invalid extra field - result = runner.invoke(pairs, [ - "-c1", "1", - "-p1", "2", - "-c2", "3", - "-p2", "4", - "--field", "value=6", - str(bins_path), - str(pairs_path), - str(out_path), - ]) - - assert result.exit_code != 0, "Command should have failed" - assert isinstance(result.exception, ValueError), ( - f"Expected ValueError, got {result.exception}" - ) - assert ( - "Column index 6 (value) exceeds number of columns (5)" in str(result.exception) - ), ( - f"Unexpected error: {result.exception}" - ) +from click.testing import CliRunner + +from cooler.cli.cload import pairs + +runner = CliRunner() + + +def test_valid_indices(tmpdir): + """Test that valid column indices process successfully.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\nchr1\t1000000\t2000000\nchr2\t0\t1000000\n") + pairs_path.write("chr1\t1000\tchr1\t2000\nchr1\t1500\tchr2\t3000\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + str(bins_path), + str(pairs_path), + 
str(out_path), + ]) + + assert result.exit_code == 0, f"Command failed: {result.output}" + + +def test_invalid_index(tmpdir): + """Test that an invalid column index raises an error.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_path.write("chr1\t1000\tchr1\t2000\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "5", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code != 0, "Command should have failed" + assert isinstance(result.exception, ValueError), ( + f"Expected ValueError, got {result.exception}" + ) + assert ( + "Column index 5 (pos2) exceeds number of columns (4)" in str(result.exception) + ), ( + f"Unexpected error: {result.exception}" + ) + + +def test_stdin_valid(tmpdir): + """Test valid indices with stdin input.""" + bins_path = tmpdir.join("bins.bed") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_data = "chr1\t1000\tchr1\t2000\n" + + result = runner.invoke( + pairs, + [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + str(bins_path), + "-", + str(out_path), + ], + input=pairs_data, + ) + + assert result.exit_code == 0, f"Command failed: {result.output} {result.exception}" + + +def test_stdin_invalid(tmpdir): + """Test invalid index with stdin raises an error.""" + bins_path = tmpdir.join("bins.bed") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_data = "chr1\t1000\tchr1\t2000\n" + + result = runner.invoke( + pairs, + [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "5", + str(bins_path), + "-", + str(out_path), + ], + input=pairs_data, + ) + + assert result.exit_code != 0, "Command should have failed" + assert isinstance(result.exception, ValueError), ( + f"Expected ValueError, got {result.exception}" + ) + assert ( + "Column index 5 (pos2) exceeds number of columns 
(4)" in str(result.exception) + ), ( + f"Unexpected error: {result.exception}" + ) + + +def test_header_skipping(tmpdir): + """Test that headers are skipped and validation works.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_path.write("#comment1\n#comment2\nchr1\t1000\tchr1\t2000\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code == 0, f"Command failed: {result.output}" + + +def test_empty_file(tmpdir): + """Test that an empty file or header-only file raises an error.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\n") + pairs_path.write("#comment1\n#comment2\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code != 0, "Command should have failed" + assert isinstance(result.exception, ValueError), ( + f"Expected ValueError, got {result.exception}" + ) + assert "is empty or contains only header lines" in str(result.exception), ( + f"Unexpected error: {result.exception}" + ) + + +def test_extra_field(tmpdir): + """Test validation with an additional --field column.""" + bins_path = tmpdir.join("bins.bed") + pairs_path = tmpdir.join("pairs.txt") + out_path = tmpdir.join("out.cool") + + bins_path.write("chr1\t0\t1000000\nchr2\t0\t1000000\n") + pairs_path.write("chr1\t1000\tchr1\t2000\t1.5\nchr1\t1500\tchr2\t3000\t2.0\n") + + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + "--field", "value=5", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code == 0, f"Command failed: {result.output}" + + # Test invalid extra 
field + result = runner.invoke(pairs, [ + "-c1", "1", + "-p1", "2", + "-c2", "3", + "-p2", "4", + "--field", "value=6", + str(bins_path), + str(pairs_path), + str(out_path), + ]) + + assert result.exit_code != 0, "Command should have failed" + assert isinstance(result.exception, ValueError), ( + f"Expected ValueError, got {result.exception}" + ) + assert ( + "Column index 6 (value) exceeds number of columns (5)" in str(result.exception) + ), ( + f"Unexpected error: {result.exception}" + ) From 3679c600a967b893cf4d6e34a622b631b420aaef Mon Sep 17 00:00:00 2001 From: ShigrafS Date: Wed, 12 Mar 2025 19:24:31 +0530 Subject: [PATCH 13/16] Switched cload.py to LF. --- src/cooler/cli/cload.py | 1350 +++++++++++++++++++-------------------- 1 file changed, 675 insertions(+), 675 deletions(-) diff --git a/src/cooler/cli/cload.py b/src/cooler/cli/cload.py index 1f77ab1a..1b2915ca 100644 --- a/src/cooler/cli/cload.py +++ b/src/cooler/cli/cload.py @@ -1,675 +1,675 @@ -import sys - -import click -from .. import create_cooler -import h5py -import io -import numpy as np -import pandas as pd -import simplejson as json -from cytoolz import compose -from multiprocess import Pool - -from ..create import ( - HDF5Aggregator, - PairixAggregator, - TabixAggregator, - aggregate_records, - create_cooler, - sanitize_records, -) -from . import cli, get_logger -from ._util import parse_bins, parse_field_param, parse_kv_list_param - -_pandas_version = pd.__version__.split(".") -if int(_pandas_version[0]) > 0: - from pandas.io.common import get_handle - - -def get_header(instream, comment_char="#"): - """Returns a header from the stream and the remainder of the stream - with the actual data. - - Parameters - ---------- - instream : a file object - An input stream. - comment_char : str - The character prepended to header lines (use '@' when parsing sams, - '#' when parsing pairsams). - Returns - ------- - header : list - The header lines, stripped of terminal spaces and newline characters. 
- remainder_stream : stream/file-like object - Stream with the remaining lines. - """ - if not comment_char: - raise ValueError("Please, provide a comment char!") - - header = [] - # Buffer the input stream to handle non-peekable streams like StringIO - buffered_lines = [] - readline_f = instream.readline - - while True: - line = readline_f() - if not line: # End of stream - break - if isinstance(line, bytes): - line = line.decode() - if not line.startswith(comment_char): - buffered_lines.append(line) - break - header.append(line.strip()) - - # Create a new stream with the remaining data - from io import StringIO - remainder_stream = StringIO("".join(buffered_lines) + instream.read()) if buffered_lines else instream - return header, remainder_stream - -def validate_pairs_columns(file_path: str, field_numbers: dict[str, int], is_stdin: bool = False) -> io.StringIO | io.TextIOBase: - """ - Validate that column indices for chrom1, pos1, chrom2, pos2, and any additional fields - do not exceed the number of columns in the pairs file. Returns the stream positioned after headers. - - Args: - file_path: Path to the pairs file or '-' for stdin. - field_numbers: Dictionary mapping field names (e.g., 'chrom1') to zero-based column indices. - is_stdin: True if input is from stdin. - - Returns: - File-like object positioned at the start of data. - - Raises: - ValueError: If any column index exceeds the number of columns in the file. 
- """ - if is_stdin: - input_data = sys.stdin.read() if hasattr(sys.stdin, 'read') else sys.stdin.getvalue() - f = io.StringIO(input_data) - else: - f = open(file_path, 'r') - - _, f = get_header(f, comment_char="#") - pos = f.tell() - first_data_line = f.readline().strip() - f.seek(pos) # Rewind to preserve the stream - if not first_data_line: - raise ValueError(f"Pairs file '{file_path}' is empty or contains only header lines") - - num_cols = len(first_data_line.split("\t")) - for name, idx in field_numbers.items(): - if idx >= num_cols: - raise ValueError( - f"Column index {idx + 1} ({name}) exceeds number of columns ({num_cols}) in '{file_path}'" - ) - return f - -@cli.group() -def cload(): - """ - Create a cooler from genomic pairs and bins. - - Choose a subcommand based on the format of the input contact list. - - """ - pass - - -# flake8: noqa -def register_subcommand(func): - return cload.command()( - click.argument("bins", type=str, metavar="BINS")( - click.argument( - "pairs_path", - type=click.Path(exists=True, allow_dash=True), - metavar="PAIRS_PATH", - )( - click.argument("cool_path", metavar="COOL_PATH")( - click.option( - "--metadata", help="Path to JSON file containing user metadata." - )( - click.option( - "--assembly", - help="Name of genome assembly (e.g. hg19, mm10)", - )(func) - ) - ) - ) - ) - ) - - -def add_arg_help(func): - func.__doc__ = func.__doc__.format( - """ - BINS : One of the following - - : 1. Path to a chromsizes file, 2. Bin size in bp - - : Path to BED file defining the genomic bin segmentation. - - PAIRS_PATH : Path to contacts (i.e. read pairs) file. 
- - COOL_PATH : Output COOL file path or URI.""" - ) - return func - - -@register_subcommand -@add_arg_help -@click.option( - "--chunksize", - "-c", - help="Control the number of pixels handled by each worker process at a time.", - type=int, - default=int(100e6), - show_default=True, -) -def hiclib(bins, pairs_path, cool_path, metadata, assembly, chunksize): - """ - Bin a hiclib HDF5 contact list (frag) file. - - {} - - hiclib on BitBucket: . - - """ - - chromsizes, bins = parse_bins(bins) - - if metadata is not None: - with open(metadata) as f: - metadata = json.load(f) - - with h5py.File(pairs_path, "r") as h5pairs: - iterator = HDF5Aggregator(h5pairs, chromsizes, bins, chunksize) - create_cooler( - cool_path, - bins, - iterator, - metadata=metadata, - assembly=assembly, - ordered=True, - ) - - -@register_subcommand -@add_arg_help -@click.option( - "--nproc", - "-p", - help="Number of processes to split the work between.", - type=int, - default=8, - show_default=True, -) -@click.option( - "--chrom2", "-c2", help="chrom2 field number (one-based)", type=int, default=4 -) -@click.option( - "--pos2", "-p2", help="pos2 field number (one-based)", type=int, default=5 -) -@click.option( - "--zero-based", - "-0", - help="Positions are zero-based", - is_flag=True, - default=False, - show_default=True, -) -@click.option( - "--max-split", - "-s", - help="Divide the pairs from each chromosome into at most this many chunks. " - "Smaller chromosomes will be split less frequently or not at all. " - "Increase ths value if large chromosomes dominate the workload on " - "multiple processors.", - type=int, - default=2, - show_default=True, -) -def tabix( - bins, - pairs_path, - cool_path, - metadata, - assembly, - nproc, - zero_based, - max_split, - **kwargs, -): - """ - Bin a tabix-indexed contact list file. - - {} - - See also: 'cooler csort' to sort and index a contact list file - - Tabix manpage: . 
- - """ - logger = get_logger(__name__) - chromsizes, bins = parse_bins(bins) - - if metadata is not None: - with open(metadata) as f: - metadata = json.load(f) - - try: - map_func = map - if nproc > 1: - pool = Pool(nproc) - logger.info(f"Using {nproc} cores") - map_func = pool.imap - - opts = {} - if "chrom2" in kwargs: - opts["C2"] = kwargs["chrom2"] - 1 - if "pos2" in kwargs: - opts["P2"] = kwargs["pos2"] - 1 - - iterator = TabixAggregator( - pairs_path, - chromsizes, - bins, - map=map_func, - is_one_based=(not zero_based), - n_chunks=max_split, - **opts, - ) - - create_cooler( - cool_path, - bins, - iterator, - metadata=metadata, - assembly=assembly, - ordered=True, - ) - finally: - if nproc > 1: - pool.close() - - -@register_subcommand -@add_arg_help -@click.option( - "--nproc", - "-p", - help="Number of processes to split the work between.", - type=int, - default=8, - show_default=True, -) -@click.option( - "--zero-based", - "-0", - help="Positions are zero-based", - is_flag=True, - default=False, - show_default=True, -) -@click.option( - "--max-split", - "-s", - help="Divide the pairs from each chromosome into at most this many chunks. " - "Smaller chromosomes will be split less frequently or not at all. " - "Increase ths value if large chromosomes dominate the workload on " - "multiple processors.", - type=int, - default=2, - show_default=True, -) -@click.option( - "--block-char", - help="Character separating contig names in the block names of the pairix " - "index.", - type=str, - default="|", - show_default=True, -) -def pairix( - bins, - pairs_path, - cool_path, - metadata, - assembly, - nproc, - zero_based, - max_split, - block_char, -): - """ - Bin a pairix-indexed contact list file. - - {} - - See also: 'cooler csort' to sort and index a contact list file - - Pairix on GitHub: . 
- - """ - logger = get_logger(__name__) - chromsizes, bins = parse_bins(bins) - - if metadata is not None: - with open(metadata) as f: - metadata = json.load(f) - - try: - map_func = map - if nproc > 1: - pool = Pool(nproc) - logger.info(f"Using {nproc} cores") - map_func = pool.imap - - iterator = PairixAggregator( - pairs_path, - chromsizes, - bins, - map=map_func, - is_one_based=(not zero_based), - n_chunks=max_split, - block_char=block_char, - ) - - create_cooler( - cool_path, - bins, - iterator, - metadata=metadata, - assembly=assembly, - ordered=True, - ) - finally: - if nproc > 1: - pool.close() - - -@register_subcommand -@add_arg_help -@click.option( - "--chrom1", "-c1", help="chrom1 field number (one-based)", type=int, required=True -) -@click.option( - "--pos1", "-p1", help="pos1 field number (one-based)", type=int, required=True -) -@click.option( - "--chrom2", "-c2", help="chrom2 field number (one-based)", type=int, required=True -) -@click.option( - "--pos2", "-p2", help="pos2 field number (one-based)", type=int, required=True -) -@click.option( - "--zero-based", - "-0", - help="Positions are zero-based", - is_flag=True, - default=False, - show_default=True, -) -@click.option( - "--comment-char", - type=str, - default="#", - show_default=True, - help="Comment character that indicates lines to ignore.", -) -@click.option( - "--no-symmetric-upper", - "-N", - help="Create a complete square matrix without implicit symmetry. " - "This allows for distinct upper- and lower-triangle values", - is_flag=True, - default=False, -) -@click.option( - "--input-copy-status", - type=click.Choice(["unique", "duplex"]), - default="unique", - help="Copy status of input data when using symmetric-upper storage. | " - "`unique`: Incoming data comes from a unique half of a symmetric " - "map, regardless of how the coordinates of a pair are ordered. " - "`duplex`: Incoming data contains upper- and lower-triangle duplicates. 
" - "All input records that map to the lower triangle will be discarded! | " - "If you wish to treat lower- and upper-triangle input data as " - "distinct, use the ``--no-symmetric-upper`` option. ", - show_default=True, -) -@click.option( - "--field", - help="Specify quantitative input fields to aggregate into value columns " - "using the syntax ``--field =``. " - "Optionally, append ``:`` followed by ``dtype=`` to specify " - "the data type (e.g. float), and/or ``agg=`` to " - "specify an aggregation function different from sum (e.g. mean). " - "Field numbers are 1-based. Passing 'count' as the target name will " - "override the default behavior of storing pair counts. " - "Repeat the ``--field`` option for each additional field.", - type=str, - multiple=True, -) -# @click.option( -# "--no-count", -# help="Do not store the pair counts. Use this only if you use `--field` to " -# "specify at least one input field for aggregation as an alternative.", -# is_flag=True, -# default=False) -@click.option( - "--chunksize", - "-c", - help="Size in number of lines/records of data chunks to read and process " - "from the input stream at a time. These chunks will be saved as temporary " - "partial coolers and then merged.", - type=int, - default=15_000_000, -) -@click.option( - "--mergebuf", - help="Total number of pixel records to buffer per epoch of merging data. " - "Defaults to the same value as `chunksize`.", - type=int, -) -@click.option( - "--max-merge", - help="Maximum number of chunks to merge in a single pass.", - type=int, - default=200, - show_default=True, -) -@click.option( - "--temp-dir", - help="Create temporary files in a specified directory. 
Pass ``-`` to use " - "the platform default temp dir.", - type=click.Path(exists=True, file_okay=False, dir_okay=True, allow_dash=True), -) -@click.option( - "--no-delete-temp", - help="Do not delete temporary files when finished.", - is_flag=True, - default=False, -) -@click.option( - "--storage-options", - help="Options to modify the data filter pipeline. Provide as a " - "comma-separated list of key-value pairs of the form 'k1=v1,k2=v2,...'. " - "See http://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline " - "for more details.", -) -@click.option( - "--append", - "-a", - is_flag=True, - default=False, - help="Pass this flag to append the output cooler to an existing file " - "instead of overwriting the file.", -) -# @click.option( -# "--format", "-f", -# help="Preset data format.", -# type=click.Choice(['4DN', 'BEDPE'])) -# --sep -def pairs( - bins, - pairs_path, - cool_path, - metadata, - assembly, - zero_based, - comment_char, - input_copy_status, - no_symmetric_upper, - field, - chunksize, - mergebuf, - temp_dir, - no_delete_temp, - max_merge, - storage_options, - append, - **kwargs, -): - """ - Bin any text file or stream of pairs. - - Pairs data need not be sorted. Accepts compressed files. - To pipe input from stdin, set PAIRS_PATH to '-'. 
- - {} - - """ - chromsizes, bins = parse_bins(bins) - if mergebuf is None: - mergebuf = chunksize - - symmetric_upper = not no_symmetric_upper - tril_action = None - if symmetric_upper: - if input_copy_status == "unique": - tril_action = "reflect" - elif input_copy_status == "duplex": - tril_action = "drop" - - if metadata is not None: - with open(metadata) as f: - metadata = json.load(f) - - input_field_names = [ - "chrom1", - "pos1", - "chrom2", - "pos2", - ] - input_field_dtypes = { - "chrom1": str, - "pos1": np.int64, - "chrom2": str, - "pos2": np.int64, - } - input_field_numbers = {} - for name in ["chrom1", "pos1", "chrom2", "pos2"]: - if kwargs[name] == 0: - raise click.BadParameter("Field numbers start at 1", param_hint=name) - input_field_numbers[name] = kwargs[name] - 1 - - # Include input value columns - output_field_names = [] - output_field_dtypes = {} - aggregations = {} - if len(field): - for arg in field: - name, colnum, dtype, agg = parse_field_param(arg) - if colnum is None: - if ( - agg is None - and dtype is not None - and name in {"bin1_id", "bin2_id", "count"} - ): - output_field_dtypes[name] = dtype - continue - else: - raise click.BadParameter( - "A field number is required.", param_hint=arg - ) - - if name not in input_field_names: - input_field_names.append(name) - - if name not in output_field_names: - output_field_names.append(name) - - input_field_numbers[name] = colnum - - if dtype is not None: - input_field_dtypes[name] = dtype - output_field_dtypes[name] = dtype - - if agg is not None: - aggregations[name] = agg - else: - aggregations[name] = "sum" - - if "count" not in output_field_names: - output_field_names.append("count") - - # Validate and get the stream - f_in = validate_pairs_columns(pairs_path, input_field_numbers, is_stdin=(pairs_path == "-")) - - # Customize the HDF5 filters - if storage_options is not None: - h5opts = parse_kv_list_param(storage_options) - for key in h5opts: - if isinstance(h5opts[key], list): - 
h5opts[key] = tuple(h5opts[key]) - else: - h5opts = None - - # Explicitly set kwargs to empty to avoid passing Click options to read_csv - kwargs = {} - - reader = pd.read_csv( - f_in, - sep="\t", - usecols=[input_field_numbers[name] for name in input_field_names], - names=input_field_names, - dtype=input_field_dtypes, - iterator=True, - chunksize=chunksize, - **kwargs, - ) - - sanitize = sanitize_records( - bins, - schema="pairs", - decode_chroms=True, - is_one_based=not zero_based, - tril_action=tril_action, - sort=True, - validate=True, - ) - aggregate = aggregate_records(agg=aggregations, count=True, sort=False) - pipeline = compose(aggregate, sanitize) - - create_cooler( - cool_path, - bins, - map(pipeline, reader), - columns=output_field_names, - dtypes=output_field_dtypes, - metadata=metadata, - assembly=assembly, - mergebuf=mergebuf, - max_merge=max_merge, - temp_dir=temp_dir, - delete_temp=not no_delete_temp, - boundscheck=False, - triucheck=False, - dupcheck=False, - ensure_sorted=False, - symmetric_upper=symmetric_upper, - h5opts=h5opts, - ordered=False, - mode="a" if append else "w", - ) +import sys + +import click +from .. import create_cooler +import h5py +import io +import numpy as np +import pandas as pd +import simplejson as json +from cytoolz import compose +from multiprocess import Pool + +from ..create import ( + HDF5Aggregator, + PairixAggregator, + TabixAggregator, + aggregate_records, + create_cooler, + sanitize_records, +) +from . import cli, get_logger +from ._util import parse_bins, parse_field_param, parse_kv_list_param + +_pandas_version = pd.__version__.split(".") +if int(_pandas_version[0]) > 0: + from pandas.io.common import get_handle + + +def get_header(instream, comment_char="#"): + """Returns a header from the stream and the remainder of the stream + with the actual data. + + Parameters + ---------- + instream : a file object + An input stream. 
+ comment_char : str + The character prepended to header lines (use '@' when parsing sams, + '#' when parsing pairsams). + Returns + ------- + header : list + The header lines, stripped of terminal spaces and newline characters. + remainder_stream : stream/file-like object + Stream with the remaining lines. + """ + if not comment_char: + raise ValueError("Please, provide a comment char!") + + header = [] + # Buffer the input stream to handle non-peekable streams like StringIO + buffered_lines = [] + readline_f = instream.readline + + while True: + line = readline_f() + if not line: # End of stream + break + if isinstance(line, bytes): + line = line.decode() + if not line.startswith(comment_char): + buffered_lines.append(line) + break + header.append(line.strip()) + + # Create a new stream with the remaining data + from io import StringIO + remainder_stream = StringIO("".join(buffered_lines) + instream.read()) if buffered_lines else instream + return header, remainder_stream + +def validate_pairs_columns(file_path: str, field_numbers: dict[str, int], is_stdin: bool = False) -> io.StringIO | io.TextIOBase: + """ + Validate that column indices for chrom1, pos1, chrom2, pos2, and any additional fields + do not exceed the number of columns in the pairs file. Returns the stream positioned after headers. + + Args: + file_path: Path to the pairs file or '-' for stdin. + field_numbers: Dictionary mapping field names (e.g., 'chrom1') to zero-based column indices. + is_stdin: True if input is from stdin. + + Returns: + File-like object positioned at the start of data. + + Raises: + ValueError: If any column index exceeds the number of columns in the file. 
+ """ + if is_stdin: + input_data = sys.stdin.read() if hasattr(sys.stdin, 'read') else sys.stdin.getvalue() + f = io.StringIO(input_data) + else: + f = open(file_path, 'r') + + _, f = get_header(f, comment_char="#") + pos = f.tell() + first_data_line = f.readline().strip() + f.seek(pos) # Rewind to preserve the stream + if not first_data_line: + raise ValueError(f"Pairs file '{file_path}' is empty or contains only header lines") + + num_cols = len(first_data_line.split("\t")) + for name, idx in field_numbers.items(): + if idx >= num_cols: + raise ValueError( + f"Column index {idx + 1} ({name}) exceeds number of columns ({num_cols}) in '{file_path}'" + ) + return f + +@cli.group() +def cload(): + """ + Create a cooler from genomic pairs and bins. + + Choose a subcommand based on the format of the input contact list. + + """ + pass + + +# flake8: noqa +def register_subcommand(func): + return cload.command()( + click.argument("bins", type=str, metavar="BINS")( + click.argument( + "pairs_path", + type=click.Path(exists=True, allow_dash=True), + metavar="PAIRS_PATH", + )( + click.argument("cool_path", metavar="COOL_PATH")( + click.option( + "--metadata", help="Path to JSON file containing user metadata." + )( + click.option( + "--assembly", + help="Name of genome assembly (e.g. hg19, mm10)", + )(func) + ) + ) + ) + ) + ) + + +def add_arg_help(func): + func.__doc__ = func.__doc__.format( + """ + BINS : One of the following + + : 1. Path to a chromsizes file, 2. Bin size in bp + + : Path to BED file defining the genomic bin segmentation. + + PAIRS_PATH : Path to contacts (i.e. read pairs) file. 
+
+    COOL_PATH : Output COOL file path or URI."""
+    )
+    return func
+
+
+@register_subcommand
+@add_arg_help
+@click.option(
+    "--chunksize",
+    "-c",
+    help="Control the number of pixels handled by each worker process at a time.",
+    type=int,
+    default=int(100e6),
+    show_default=True,
+)
+def hiclib(bins, pairs_path, cool_path, metadata, assembly, chunksize):
+    """
+    Bin a hiclib HDF5 contact list (frag) file.
+
+    {}
+
+    hiclib on BitBucket: .
+
+    """
+
+    chromsizes, bins = parse_bins(bins)
+
+    if metadata is not None:
+        with open(metadata) as f:
+            metadata = json.load(f)
+
+    with h5py.File(pairs_path, "r") as h5pairs:
+        iterator = HDF5Aggregator(h5pairs, chromsizes, bins, chunksize)
+        create_cooler(
+            cool_path,
+            bins,
+            iterator,
+            metadata=metadata,
+            assembly=assembly,
+            ordered=True,
+        )
+
+
+@register_subcommand
+@add_arg_help
+@click.option(
+    "--nproc",
+    "-p",
+    help="Number of processes to split the work between.",
+    type=int,
+    default=8,
+    show_default=True,
+)
+@click.option(
+    "--chrom2", "-c2", help="chrom2 field number (one-based)", type=int, default=4
+)
+@click.option(
+    "--pos2", "-p2", help="pos2 field number (one-based)", type=int, default=5
+)
+@click.option(
+    "--zero-based",
+    "-0",
+    help="Positions are zero-based",
+    is_flag=True,
+    default=False,
+    show_default=True,
+)
+@click.option(
+    "--max-split",
+    "-s",
+    help="Divide the pairs from each chromosome into at most this many chunks. "
+    "Smaller chromosomes will be split less frequently or not at all. "
+    "Increase this value if large chromosomes dominate the workload on "
+    "multiple processors.",
+    type=int,
+    default=2,
+    show_default=True,
+)
+def tabix(
+    bins,
+    pairs_path,
+    cool_path,
+    metadata,
+    assembly,
+    nproc,
+    zero_based,
+    max_split,
+    **kwargs,
+):
+    """
+    Bin a tabix-indexed contact list file.
+
+    {}
+
+    See also: 'cooler csort' to sort and index a contact list file
+
+    Tabix manpage: .
+ + """ + logger = get_logger(__name__) + chromsizes, bins = parse_bins(bins) + + if metadata is not None: + with open(metadata) as f: + metadata = json.load(f) + + try: + map_func = map + if nproc > 1: + pool = Pool(nproc) + logger.info(f"Using {nproc} cores") + map_func = pool.imap + + opts = {} + if "chrom2" in kwargs: + opts["C2"] = kwargs["chrom2"] - 1 + if "pos2" in kwargs: + opts["P2"] = kwargs["pos2"] - 1 + + iterator = TabixAggregator( + pairs_path, + chromsizes, + bins, + map=map_func, + is_one_based=(not zero_based), + n_chunks=max_split, + **opts, + ) + + create_cooler( + cool_path, + bins, + iterator, + metadata=metadata, + assembly=assembly, + ordered=True, + ) + finally: + if nproc > 1: + pool.close() + + +@register_subcommand +@add_arg_help +@click.option( + "--nproc", + "-p", + help="Number of processes to split the work between.", + type=int, + default=8, + show_default=True, +) +@click.option( + "--zero-based", + "-0", + help="Positions are zero-based", + is_flag=True, + default=False, + show_default=True, +) +@click.option( + "--max-split", + "-s", + help="Divide the pairs from each chromosome into at most this many chunks. " + "Smaller chromosomes will be split less frequently or not at all. " + "Increase ths value if large chromosomes dominate the workload on " + "multiple processors.", + type=int, + default=2, + show_default=True, +) +@click.option( + "--block-char", + help="Character separating contig names in the block names of the pairix " + "index.", + type=str, + default="|", + show_default=True, +) +def pairix( + bins, + pairs_path, + cool_path, + metadata, + assembly, + nproc, + zero_based, + max_split, + block_char, +): + """ + Bin a pairix-indexed contact list file. + + {} + + See also: 'cooler csort' to sort and index a contact list file + + Pairix on GitHub: . 
+ + """ + logger = get_logger(__name__) + chromsizes, bins = parse_bins(bins) + + if metadata is not None: + with open(metadata) as f: + metadata = json.load(f) + + try: + map_func = map + if nproc > 1: + pool = Pool(nproc) + logger.info(f"Using {nproc} cores") + map_func = pool.imap + + iterator = PairixAggregator( + pairs_path, + chromsizes, + bins, + map=map_func, + is_one_based=(not zero_based), + n_chunks=max_split, + block_char=block_char, + ) + + create_cooler( + cool_path, + bins, + iterator, + metadata=metadata, + assembly=assembly, + ordered=True, + ) + finally: + if nproc > 1: + pool.close() + + +@register_subcommand +@add_arg_help +@click.option( + "--chrom1", "-c1", help="chrom1 field number (one-based)", type=int, required=True +) +@click.option( + "--pos1", "-p1", help="pos1 field number (one-based)", type=int, required=True +) +@click.option( + "--chrom2", "-c2", help="chrom2 field number (one-based)", type=int, required=True +) +@click.option( + "--pos2", "-p2", help="pos2 field number (one-based)", type=int, required=True +) +@click.option( + "--zero-based", + "-0", + help="Positions are zero-based", + is_flag=True, + default=False, + show_default=True, +) +@click.option( + "--comment-char", + type=str, + default="#", + show_default=True, + help="Comment character that indicates lines to ignore.", +) +@click.option( + "--no-symmetric-upper", + "-N", + help="Create a complete square matrix without implicit symmetry. " + "This allows for distinct upper- and lower-triangle values", + is_flag=True, + default=False, +) +@click.option( + "--input-copy-status", + type=click.Choice(["unique", "duplex"]), + default="unique", + help="Copy status of input data when using symmetric-upper storage. | " + "`unique`: Incoming data comes from a unique half of a symmetric " + "map, regardless of how the coordinates of a pair are ordered. " + "`duplex`: Incoming data contains upper- and lower-triangle duplicates. 
" + "All input records that map to the lower triangle will be discarded! | " + "If you wish to treat lower- and upper-triangle input data as " + "distinct, use the ``--no-symmetric-upper`` option. ", + show_default=True, +) +@click.option( + "--field", + help="Specify quantitative input fields to aggregate into value columns " + "using the syntax ``--field =``. " + "Optionally, append ``:`` followed by ``dtype=`` to specify " + "the data type (e.g. float), and/or ``agg=`` to " + "specify an aggregation function different from sum (e.g. mean). " + "Field numbers are 1-based. Passing 'count' as the target name will " + "override the default behavior of storing pair counts. " + "Repeat the ``--field`` option for each additional field.", + type=str, + multiple=True, +) +# @click.option( +# "--no-count", +# help="Do not store the pair counts. Use this only if you use `--field` to " +# "specify at least one input field for aggregation as an alternative.", +# is_flag=True, +# default=False) +@click.option( + "--chunksize", + "-c", + help="Size in number of lines/records of data chunks to read and process " + "from the input stream at a time. These chunks will be saved as temporary " + "partial coolers and then merged.", + type=int, + default=15_000_000, +) +@click.option( + "--mergebuf", + help="Total number of pixel records to buffer per epoch of merging data. " + "Defaults to the same value as `chunksize`.", + type=int, +) +@click.option( + "--max-merge", + help="Maximum number of chunks to merge in a single pass.", + type=int, + default=200, + show_default=True, +) +@click.option( + "--temp-dir", + help="Create temporary files in a specified directory. 
Pass ``-`` to use " + "the platform default temp dir.", + type=click.Path(exists=True, file_okay=False, dir_okay=True, allow_dash=True), +) +@click.option( + "--no-delete-temp", + help="Do not delete temporary files when finished.", + is_flag=True, + default=False, +) +@click.option( + "--storage-options", + help="Options to modify the data filter pipeline. Provide as a " + "comma-separated list of key-value pairs of the form 'k1=v1,k2=v2,...'. " + "See http://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline " + "for more details.", +) +@click.option( + "--append", + "-a", + is_flag=True, + default=False, + help="Pass this flag to append the output cooler to an existing file " + "instead of overwriting the file.", +) +# @click.option( +# "--format", "-f", +# help="Preset data format.", +# type=click.Choice(['4DN', 'BEDPE'])) +# --sep +def pairs( + bins, + pairs_path, + cool_path, + metadata, + assembly, + zero_based, + comment_char, + input_copy_status, + no_symmetric_upper, + field, + chunksize, + mergebuf, + temp_dir, + no_delete_temp, + max_merge, + storage_options, + append, + **kwargs, +): + """ + Bin any text file or stream of pairs. + + Pairs data need not be sorted. Accepts compressed files. + To pipe input from stdin, set PAIRS_PATH to '-'. 
+ + {} + + """ + chromsizes, bins = parse_bins(bins) + if mergebuf is None: + mergebuf = chunksize + + symmetric_upper = not no_symmetric_upper + tril_action = None + if symmetric_upper: + if input_copy_status == "unique": + tril_action = "reflect" + elif input_copy_status == "duplex": + tril_action = "drop" + + if metadata is not None: + with open(metadata) as f: + metadata = json.load(f) + + input_field_names = [ + "chrom1", + "pos1", + "chrom2", + "pos2", + ] + input_field_dtypes = { + "chrom1": str, + "pos1": np.int64, + "chrom2": str, + "pos2": np.int64, + } + input_field_numbers = {} + for name in ["chrom1", "pos1", "chrom2", "pos2"]: + if kwargs[name] == 0: + raise click.BadParameter("Field numbers start at 1", param_hint=name) + input_field_numbers[name] = kwargs[name] - 1 + + # Include input value columns + output_field_names = [] + output_field_dtypes = {} + aggregations = {} + if len(field): + for arg in field: + name, colnum, dtype, agg = parse_field_param(arg) + if colnum is None: + if ( + agg is None + and dtype is not None + and name in {"bin1_id", "bin2_id", "count"} + ): + output_field_dtypes[name] = dtype + continue + else: + raise click.BadParameter( + "A field number is required.", param_hint=arg + ) + + if name not in input_field_names: + input_field_names.append(name) + + if name not in output_field_names: + output_field_names.append(name) + + input_field_numbers[name] = colnum + + if dtype is not None: + input_field_dtypes[name] = dtype + output_field_dtypes[name] = dtype + + if agg is not None: + aggregations[name] = agg + else: + aggregations[name] = "sum" + + if "count" not in output_field_names: + output_field_names.append("count") + + # Validate and get the stream + f_in = validate_pairs_columns(pairs_path, input_field_numbers, is_stdin=(pairs_path == "-")) + + # Customize the HDF5 filters + if storage_options is not None: + h5opts = parse_kv_list_param(storage_options) + for key in h5opts: + if isinstance(h5opts[key], list): + 
h5opts[key] = tuple(h5opts[key]) + else: + h5opts = None + + # Explicitly set kwargs to empty to avoid passing Click options to read_csv + kwargs = {} + + reader = pd.read_csv( + f_in, + sep="\t", + usecols=[input_field_numbers[name] for name in input_field_names], + names=input_field_names, + dtype=input_field_dtypes, + iterator=True, + chunksize=chunksize, + **kwargs, + ) + + sanitize = sanitize_records( + bins, + schema="pairs", + decode_chroms=True, + is_one_based=not zero_based, + tril_action=tril_action, + sort=True, + validate=True, + ) + aggregate = aggregate_records(agg=aggregations, count=True, sort=False) + pipeline = compose(aggregate, sanitize) + + create_cooler( + cool_path, + bins, + map(pipeline, reader), + columns=output_field_names, + dtypes=output_field_dtypes, + metadata=metadata, + assembly=assembly, + mergebuf=mergebuf, + max_merge=max_merge, + temp_dir=temp_dir, + delete_temp=not no_delete_temp, + boundscheck=False, + triucheck=False, + dupcheck=False, + ensure_sorted=False, + symmetric_upper=symmetric_upper, + h5opts=h5opts, + ordered=False, + mode="a" if append else "w", + ) From 9e4405b74d3a8a8c84c245764f4e5ca0e4a77d83 Mon Sep 17 00:00:00 2001 From: ShigrafS <140247389+ShigrafS@users.noreply.github.com> Date: Wed, 16 Apr 2025 18:48:14 +0530 Subject: [PATCH 14/16] Replaced '|' operator in type hints with 'Union[...]' for compatibility with Python 3.9 in cload.py. 
--- src/cooler/cli/cload.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cooler/cli/cload.py b/src/cooler/cli/cload.py index 1b2915ca..61efe104 100644 --- a/src/cooler/cli/cload.py +++ b/src/cooler/cli/cload.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd import simplejson as json +from typing import Union from cytoolz import compose from multiprocess import Pool @@ -68,7 +69,7 @@ def get_header(instream, comment_char="#"): remainder_stream = StringIO("".join(buffered_lines) + instream.read()) if buffered_lines else instream return header, remainder_stream -def validate_pairs_columns(file_path: str, field_numbers: dict[str, int], is_stdin: bool = False) -> io.StringIO | io.TextIOBase: +def validate_pairs_columns(file_path: str, field_numbers: dict[str, int], is_stdin: bool = False) -> Union[io.StringIO, io.TextIOBase]: """ Validate that column indices for chrom1, pos1, chrom2, pos2, and any additional fields do not exceed the number of columns in the pairs file. Returns the stream positioned after headers. From 7f920b2ab4d9c756f870e6bc66a1370c73973c59 Mon Sep 17 00:00:00 2001 From: ShigrafS <140247389+ShigrafS@users.noreply.github.com> Date: Wed, 16 Apr 2025 18:58:27 +0000 Subject: [PATCH 15/16] Fixed failing errors. --- src/cooler/util.py | 68 +++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/src/cooler/util.py b/src/cooler/util.py index cb933250..13e68eb2 100644 --- a/src/cooler/util.py +++ b/src/cooler/util.py @@ -226,34 +226,46 @@ def read_chromsizes( verbose : bool, optional Whether to enable verbose logging for diagnostics. 
""" - # Check if the input is a file-like object (StringIO or file path) and - # inspect the first line for delimiters - if isinstance(filepath_or, (str, io.StringIO)): - first_line = None - if isinstance(filepath_or, io.StringIO): - first_line = filepath_or.getvalue().splitlines()[0] - elif isinstance(filepath_or, str): - with open(filepath_or) as file: - first_line = file.readline() - - if first_line and ' ' in first_line: - raise ValueError( - f"Chromsizes file '{filepath_or}' uses spaces instead of tabs " - "as delimiters. Please use tabs.") - - # Read the chromosome size file into a DataFrame - # on_bad_lines="error" will raise an error if any row does not have exactly two - # columns. - chromtable = pd.read_csv( - filepath_or, - sep="\t", # Ensuring tab is the delimiter - usecols=[0, 1], - names=["name", "length"], - dtype={"name": str}, - on_bad_lines="error", - **kwargs, - ) - + # Handle URL case separately + if isinstance(filepath_or, str) and filepath_or.startswith(('http://', 'https://')): + try: + # Use pandas' built-in URL handling + chromtable = pd.read_csv( + filepath_or, + sep="\t", + usecols=[0, 1], + names=["name", "length"], + dtype={"name": str}, + on_bad_lines="error", + **kwargs, + ) + except Exception as e: + raise ValueError(f"Failed to fetch URL {filepath_or}: {e!s}") from e + else: + # Original validation for local files/StringIO + if isinstance(filepath_or, (str, io.StringIO)): + first_line = None + if isinstance(filepath_or, io.StringIO): + first_line = filepath_or.getvalue().splitlines()[0] + elif isinstance(filepath_or, str): + with open(filepath_or) as file: + first_line = file.readline() + + if first_line and ' ' in first_line: + raise ValueError( + f"Chromsizes file '{filepath_or}' uses spaces instead of tabs " + "as delimiters. 
Please use tabs.") + + # Read the chromosome size file into a DataFrame + chromtable = pd.read_csv( + filepath_or, + sep="\t", # Ensuring tab is the delimiter + usecols=[0, 1], + names=["name", "length"], + dtype={"name": str}, + on_bad_lines="error", + **kwargs, + ) # Convert the "length" column to numeric values, coercing errors to NaN chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce") From d876ca9a704cc56879c0754968df13d470c0fdd5 Mon Sep 17 00:00:00 2001 From: ShigrafS <140247389+ShigrafS@users.noreply.github.com> Date: Wed, 16 Apr 2025 19:39:07 +0000 Subject: [PATCH 16/16] Fixed failing tests. --- src/cooler/cli/cload.py | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/src/cooler/cli/cload.py b/src/cooler/cli/cload.py index 61efe104..d1f52af7 100644 --- a/src/cooler/cli/cload.py +++ b/src/cooler/cli/cload.py @@ -4,6 +4,7 @@ from .. import create_cooler import h5py import io +import gzip import numpy as np import pandas as pd import simplejson as json @@ -49,15 +50,16 @@ def get_header(instream, comment_char="#"): raise ValueError("Please, provide a comment char!") header = [] - # Buffer the input stream to handle non-peekable streams like StringIO buffered_lines = [] readline_f = instream.readline + is_bytes = False while True: line = readline_f() if not line: # End of stream break if isinstance(line, bytes): + is_bytes = True line = line.decode() if not line.startswith(comment_char): buffered_lines.append(line) @@ -65,10 +67,30 @@ def get_header(instream, comment_char="#"): header.append(line.strip()) # Create a new stream with the remaining data - from io import StringIO - remainder_stream = StringIO("".join(buffered_lines) + instream.read()) if buffered_lines else instream + from io import StringIO, BytesIO + if buffered_lines: + if is_bytes: + # For binary streams, read remaining data as bytes + remainder = instream.read() + if isinstance(remainder, str): + remainder = 
remainder.encode() + remainder_stream = BytesIO(b"".join([line.encode() for line in buffered_lines]) + remainder) + else: + # For text streams, read remaining data as text + remainder = instream.read() + if isinstance(remainder, bytes): + remainder = remainder.decode() + remainder_stream = StringIO("".join(buffered_lines) + remainder) + else: + remainder_stream = instream + return header, remainder_stream +def open_maybe_gzip(path, mode="rt"): + if isinstance(path, str) and path.endswith(".gz"): + return gzip.open(path, mode) + return open(path, mode) + def validate_pairs_columns(file_path: str, field_numbers: dict[str, int], is_stdin: bool = False) -> Union[io.StringIO, io.TextIOBase]: """ Validate that column indices for chrom1, pos1, chrom2, pos2, and any additional fields @@ -89,11 +111,17 @@ def validate_pairs_columns(file_path: str, field_numbers: dict[str, int], is_std input_data = sys.stdin.read() if hasattr(sys.stdin, 'read') else sys.stdin.getvalue() f = io.StringIO(input_data) else: - f = open(file_path, 'r') + f = open_maybe_gzip(file_path, 'r') _, f = get_header(f, comment_char="#") pos = f.tell() - first_data_line = f.readline().strip() + + # Read first data line and check column count + first_data_line = f.readline() + if isinstance(first_data_line, bytes): + first_data_line = first_data_line.decode() + first_data_line = first_data_line.strip() + f.seek(pos) # Rewind to preserve the stream if not first_data_line: raise ValueError(f"Pairs file '{file_path}' is empty or contains only header lines")