Skip to content
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
606aaec
pyarrow force 18.0.0
ypriverol Dec 1, 2024
4e4b8bb
testing
ypriverol Dec 1, 2024
740b4ab
remove ms2rescore.py
ypriverol Dec 1, 2024
7183557
remove ms2rescore.py
ypriverol Dec 1, 2024
08da89b
remove ms2rescore.py
ypriverol Dec 1, 2024
80d8184
remove ms2rescore.py
ypriverol Dec 1, 2024
582d648
remove ms2rescore.py
ypriverol Dec 1, 2024
7a7c8c3
remove ms2rescore.py
ypriverol Dec 1, 2024
2b11f90
remove ms2rescore.py
ypriverol Dec 1, 2024
6d7da40
remove ms2rescore.py
ypriverol Dec 2, 2024
502761e
remove ms2rescore.py
ypriverol Dec 2, 2024
2590939
remove ms2rescore.py
ypriverol Dec 2, 2024
6d82424
remove ms2rescore.py
ypriverol Dec 2, 2024
2d6e634
remove ms2rescore.py
ypriverol Dec 2, 2024
a8ac9f4
remove ms2rescore.py
ypriverol Dec 2, 2024
d18b101
latest versions for all packages
ypriverol Dec 2, 2024
0ae008e
latest versions for all packages
ypriverol Dec 2, 2024
1b4188d
latest versions for all packages
ypriverol Dec 2, 2024
1c2d051
major update on tests
ypriverol Dec 2, 2024
af9b247
constants.py created.
ypriverol Dec 2, 2024
d8bc225
constants.py created.
ypriverol Dec 2, 2024
03ac3af
constants.py created.
ypriverol Dec 2, 2024
5b9ef64
constants.py created.
ypriverol Dec 2, 2024
b200964
constants.py created.
ypriverol Dec 2, 2024
422d9da
constants.py created.
ypriverol Dec 3, 2024
fe54ec5
constants.py created.
ypriverol Dec 3, 2024
1cf33b1
constants.py created.
ypriverol Dec 3, 2024
75a98e9
constants.py created.
ypriverol Dec 3, 2024
6b9a2b1
major changes in batch processing.
ypriverol Dec 3, 2024
9f54092
fix bug.
ypriverol Dec 3, 2024
f180cca
major changes in batch processing and remove ms2rescore.py
ypriverol Dec 1, 2024
2980bb6
Merge remote-tracking branch 'origin/dev' into dev
ypriverol Dec 3, 2024
8c4c822
minor cleaning tool.
ypriverol Dec 3, 2024
b5773c7
minor cleaning tool.
ypriverol Dec 3, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/conda-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
cache-downloads: true
auto-update-conda: false
activate-environment: test
python-version: "3.10"
python-version: "3.12"

- name: Setup conda-build and anaconda-client
run: |
Expand Down
9 changes: 1 addition & 8 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,4 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
poetry run pytest
- name: Download test data
run: |
wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML
wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML
- name: Test percolator ms2rescore
run: |
quantmsutilsc ms2rescore --psm_file TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML --spectrum_path TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML --processes 2 --ms2pip_model HCD2021 --feature_generators 'ms2pip,deeplc' --id_decoy_pattern ^rev --test_fdr 0.05
poetry run pytest
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,7 @@ cython_debug/
*_df.csv
*.tsv
/tests/test_data/hMICAL1_coiPAnP-N2-200_3Murea-1Mthiourea-200mMtcep_14733.d/
/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_1fmol_inj1.mzML
/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_1fmol_inj2.mzML
/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_25fmol_inj1.mzML
/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_25fmol_inj2.mzML
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@ The following functionalities are available in the package:
- `openms2sample` - Extract sample information from an OpenMS experimental design file. An example of an OpenMS experimental design file is available [here](https://github.com/bigbio/quantms-utils/blob/dev/tests/test_data/BSA_design_urls.tsv).
- `checksamplesheet` - Check the sample sheet for errors and inconsistencies. The experimental design could be an OpenMS experimental design file or an SDRF file.

### ms2rescore scripts

- `ms2rescore` - Rescore MS2 spectra using the MS2PIP model. The output is a mzML file with the rescored MS2 spectra.

### Features to percolator scripts

- `sage2feature` - The add_sage_feature function enhances an idXML file by appending additional features from a Sage feature table, excluding those generated by 'psm_file'.
Expand Down
12 changes: 3 additions & 9 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,7 @@ channels:
dependencies:
- click
- sdrf-pipelines>=0.0.31
- pyopenms>=2.4.0
- pyopenms>=3.2.0
- pandas
- numpy
- pyarrow
- ms2rescore=3.0.3
- deepLC=2.2.38
- psm-utils=0.8.3
- scipy=1.13.1
- pygam
- protobuf=3.19.6
- pyarrow>=16.1.0
- scipy
16 changes: 5 additions & 11 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "quantms-utils"
description = "Python scripts and helpers for the quantMS workflow"
readme = "README.md"
license = "MIT"
version = "0.0.16"
version = "0.0.17"
authors = [
"Yasset Perez-Riverol <[email protected]>",
"Dai Chengxin <[email protected]>",
Expand All @@ -29,19 +29,13 @@ packages = [
]

[tool.poetry.dependencies]
python = ">=3.8,<3.11"
python = "*"
click = "*"
sdrf-pipelines = ">=0.0.31"
pyopenms = ">=2.4.0"
ms2rescore = "3.0.3"
pyopenms = ">=3.2.0"
pandas = "*"
numpy = "*"
pyarrow = "*"
psm-utils = "0.8.3"
deepLC = "2.2.38"
scipy = "1.13.1"
pygam = "*"
protobuf= "3.19.6"
pyarrow = ">=16.1.0"
scipy = "*"

[tool.poetry.urls]
GitHub = "https://github.com/bigbio/quantms-utils"
Expand Down
2 changes: 1 addition & 1 deletion quantmsutils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.16"
__version__ = "0.0.17"
9 changes: 7 additions & 2 deletions quantmsutils/diann/diann2mztab.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from pyopenms import AASequence, FASTAFile, ModificationsDB
from pyopenms.Constants import PROTON_MASS_U

from quantmsutils.utils.constants import MS_LEVEL, RETENTION_TIME, SCAN, EXPERIMENTAL_MASS_TO_CHARGE

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
Expand Down Expand Up @@ -1097,11 +1099,14 @@ def __find_info(directory, n):

file = __find_info(folder, n)
target = pd.read_parquet(file)
target = target[target["MSLevel"] == 2]

# Read original parquet columns from mzml_stats
target = target[target[MS_LEVEL] == 2]
target.reset_index(inplace=True, drop=True)
target["DIANN-intraID"] = target.index
group.sort_values(by="RT", inplace=True)
target = target[["Retention_Time", "SpectrumID", "DIANN-intraID", "Exp_Mass_To_Charge"]]
target = target[[RETENTION_TIME, SCAN, "DIANN-intraID", EXPERIMENTAL_MASS_TO_CHARGE]]

target.columns = [
"RT",
"opt_global_spectrum_reference",
Expand Down
133 changes: 61 additions & 72 deletions quantmsutils/mzml/mzml_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
import pyarrow.parquet as pq
from pyopenms import MzMLFile

from quantmsutils.utils.constants import CHARGE, SCAN, MS_LEVEL, NUM_PEAKS, BASE_PEAK_INTENSITY, \
SUMMED_PEAK_INTENSITY, RETENTION_TIME, EXPERIMENTAL_MASS_TO_CHARGE, ACQUISITION_DATETIME, MZ_ARRAY, \
INTENSITY_ARRAY, MONOISOTOPIC_MZ, MAX_INTENSITY


class BatchWritingConsumer:
"""
Expand All @@ -24,10 +28,12 @@ def __init__(
output_path,
batch_size=10000,
id_only=False,
id_output_path=None,
):
self.parquet_schema = parquet_schema
self.id_parquet_schema = id_parquet_schema
self.output_path = output_path
self.id_output_path = id_output_path
self.batch_size = batch_size
self.id_only = id_only
self.batch_data = []
Expand Down Expand Up @@ -69,46 +75,42 @@ def consumeSpectrum(self, spectrum):
if self.id_only:
scan_id = self.scan_pattern.findall(spectrum.getNativeID())[0]
self.psm_parts.append(
[
{
"scan": scan_id,
"ms_level": ms_level,
"mz": mz_array,
"intensity": intensity_array,
}
]
[str(scan_id),
int(ms_level),
mz_array.tolist(),
intensity_array.tolist(),]
)

row_data = {
"SpectrumID": spectrum.getNativeID(),
"MSLevel": float(ms_level),
"Charge": float(charge_state) if charge_state is not None else None,
"MS_peaks": float(peak_per_ms),
"Base_Peak_Intensity": (
SCAN: spectrum.getNativeID(),
MS_LEVEL: int(ms_level),
CHARGE: int(charge_state) if charge_state is not None else None,
NUM_PEAKS: int(peak_per_ms),
BASE_PEAK_INTENSITY: (
float(base_peak_intensity) if base_peak_intensity is not None else None
),
"Summed_Peak_Intensities": (
SUMMED_PEAK_INTENSITY: (
float(total_intensity) if total_intensity is not None else None
),
"Retention_Time": float(rt),
"Exp_Mass_To_Charge": float(exp_mz) if exp_mz is not None else None,
"AcquisitionDateTime": str(self.acquisition_datetime),
RETENTION_TIME: float(rt),
EXPERIMENTAL_MASS_TO_CHARGE: float(exp_mz) if exp_mz is not None else None,
ACQUISITION_DATETIME: str(self.acquisition_datetime),
}
elif ms_level == 1:
row_data = {
"SpectrumID": spectrum.getNativeID(),
"MSLevel": float(ms_level),
"Charge": None,
"MS_peaks": float(peak_per_ms),
"Base_Peak_Intensity": (
SCAN: spectrum.getNativeID(),
MS_LEVEL: int(ms_level),
CHARGE: None,
NUM_PEAKS: int(peak_per_ms),
BASE_PEAK_INTENSITY: (
float(base_peak_intensity) if base_peak_intensity is not None else None
),
"Summed_Peak_Intensities": (
SUMMED_PEAK_INTENSITY: (
float(total_intensity) if total_intensity is not None else None
),
"Retention_Time": float(rt),
"Exp_Mass_To_Charge": None,
"AcquisitionDateTime": str(self.acquisition_datetime),
RETENTION_TIME: float(rt),
EXPERIMENTAL_MASS_TO_CHARGE: None,
ACQUISITION_DATETIME: str(self.acquisition_datetime),
}
else:
return
Expand Down Expand Up @@ -139,7 +141,7 @@ def _write_batch(self):
where=self.output_path, schema=self.parquet_schema, compression="gzip"
)

# Create a RecordBatch directly from the current batch
# Create a Table directly from the current batch
batch = pa.RecordBatch.from_pylist(self.batch_data, schema=self.parquet_schema)

# Write the batch directly
Expand All @@ -148,22 +150,6 @@ def _write_batch(self):
# Clear the batch data
self.batch_data = []

# Handle ID-only data if applicable
if self.id_only and self.psm_parts:
# Similar approach for spectrum ID data
if self.id_parquet_writer is None:
self.id_parquet_writer = pq.ParquetWriter(
where=f"{Path(self.output_path).stem}_spectrum_df.parquet",
schema=self.id_parquet_schema,
compression="gzip",
)

id_batch = pa.RecordBatch.from_pylist(
self.psm_parts, schema=self.id_parquet_schema
)
self.id_parquet_writer.write_batch(id_batch)
self.psm_parts = []

except Exception as e:
print(f"Error during batch writing: {e}")
raise
Expand All @@ -176,16 +162,15 @@ def finalize(self):
if self.batch_data:
self._write_batch()

# Write spectrum data if id_only
if self.id_only and self.psm_parts:
self._write_batch()

if self.parquet_writer:
self.parquet_writer.close()

if self.id_parquet_writer:
self.id_parquet_writer.close()

if self.id_only and self.psm_parts:
# TODO: Find a way to write this data in batches, as is already done for the regular mz data with batch writing.
# TODO: The obstacle found so far is that pa.RecordBatch.from_pylist does not support a list of lists.
df = pd.DataFrame(
self.psm_parts, columns=["scan", "ms_level", "mz", "intensity"])
df.to_parquet(self.id_output_path, index=False, engine="pyarrow", compression="gzip")

def column_exists(conn, table_name: str) -> List[str]:
"""
Expand Down Expand Up @@ -221,24 +206,24 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False, batch_size: int =
"""
schema = pa.schema(
[
pa.field("SpectrumID", pa.string(), nullable=True),
pa.field("MSLevel", pa.float64(), nullable=True),
pa.field("Charge", pa.float64(), nullable=True),
pa.field("MS_peaks", pa.float64(), nullable=True),
pa.field("Base_Peak_Intensity", pa.float64(), nullable=True),
pa.field("Summed_Peak_Intensities", pa.float64(), nullable=True),
pa.field("Retention_Time", pa.float64(), nullable=True),
pa.field("Exp_Mass_To_Charge", pa.float64(), nullable=True),
pa.field("AcquisitionDateTime", pa.string(), nullable=True),
pa.field(SCAN, pa.string(), nullable=True),
pa.field(MS_LEVEL, pa.int32(), nullable=True),
pa.field(CHARGE, pa.int32(), nullable=True),
pa.field(NUM_PEAKS, pa.int32(), nullable=True),
pa.field(BASE_PEAK_INTENSITY, pa.float64(), nullable=True),
pa.field(SUMMED_PEAK_INTENSITY, pa.float64(), nullable=True),
pa.field(RETENTION_TIME, pa.float64(), nullable=True),
pa.field(EXPERIMENTAL_MASS_TO_CHARGE, pa.float64(), nullable=True),
pa.field(ACQUISITION_DATETIME, pa.string(), nullable=True),
]
)

id_schema = pa.schema(
[
("scan", pa.string()),
("ms_level", pa.int32()),
("mz", pa.list_(pa.float64())),
("intensity", pa.list_(pa.float64())),
(SCAN, pa.string()),
(MS_LEVEL, pa.int32()),
(MZ_ARRAY, pa.list_(pa.float64())),
(INTENSITY_ARRAY, pa.list_(pa.float64())),
]
)

Expand All @@ -248,6 +233,7 @@ def batch_write_mzml_streaming(
output_path: str,
id_parquet_schema: pa.Schema,
id_only: bool = False,
id_output_path: str = None,
batch_size: int = 10000,
) -> Optional[str]:
"""
Expand All @@ -258,6 +244,7 @@ def batch_write_mzml_streaming(
output_path=output_path,
batch_size=batch_size,
id_only=id_only,
id_output_path=id_output_path,
id_parquet_schema=id_parquet_schema,
)
try:
Expand Down Expand Up @@ -308,15 +295,15 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100

schema = pa.schema(
[
pa.field("Id", pa.int32(), nullable=False),
pa.field("MsMsType", pa.int32(), nullable=True),
pa.field("NumPeaks", pa.int32(), nullable=True),
pa.field("MaxIntensity", pa.float64(), nullable=True),
pa.field("SummedIntensities", pa.float64(), nullable=True),
pa.field("Time", pa.float64(), nullable=True),
pa.field("Charge", pa.int32(), nullable=True),
pa.field("MonoisotopicMz", pa.float64(), nullable=True),
pa.field("AcquisitionDateTime", pa.string(), nullable=True),
pa.field(SCAN, pa.int32(), nullable=False),
pa.field(MS_LEVEL, pa.int32(), nullable=True),
pa.field(NUM_PEAKS, pa.int32(), nullable=True),
pa.field(MAX_INTENSITY, pa.float64(), nullable=True),
pa.field(SUMMED_PEAK_INTENSITY, pa.float64(), nullable=True),
pa.field(RETENTION_TIME, pa.float64(), nullable=True),
pa.field(CHARGE, pa.int32(), nullable=True),
pa.field(MONOISOTOPIC_MZ, pa.float64(), nullable=True),
pa.field(ACQUISITION_DATETIME, pa.string(), nullable=True),
]
)

Expand All @@ -341,6 +328,7 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
# Resolve file path
ms_path = _resolve_ms_path(ms_path)
output_path = f"{Path(ms_path).stem}_ms_info.parquet"
id_output_path = f"{Path(ms_path).stem}_spectrum_df.parquet"

# Choose processing method based on file type
if Path(ms_path).suffix == ".d":
Expand All @@ -352,6 +340,7 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
id_parquet_schema=id_schema,
output_path=output_path,
id_only=id_only,
id_output_path=id_output_path,
batch_size=batch_size,
)
else:
Expand Down
2 changes: 0 additions & 2 deletions quantmsutils/quantmsutilsc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from quantmsutils.features.sage_feature import add_sage_feature
from quantmsutils.mzml.mzml_statistics import mzml_statistics
from quantmsutils.psm.psm_conversion import convert_psm
from quantmsutils.rescoring.ms2rescore import ms2rescore
from quantmsutils.sdrf.check_samplesheet import checksamplesheet
from quantmsutils.sdrf.extract_sample import extract_sample_from_expdesign
from quantmsutils.features.snr import spectrum2feature
Expand All @@ -28,7 +27,6 @@ def cli():
cli.add_command(mzml_statistics)
cli.add_command(extract_sample_from_expdesign)
cli.add_command(checksamplesheet)
cli.add_command(ms2rescore)
cli.add_command(convert_psm)
cli.add_command(spectrum2feature)

Expand Down
Empty file removed quantmsutils/rescoring/__init__.py
Empty file.
Loading
Loading