bigbio · ypriverol · Dec 3, 2024 · Dec 1, 2024 · Dec 1, 2024 · Dec 1, 2024
diff --git a/.github/workflows/conda-build.yml b/.github/workflows/conda-build.yml
@@ -20,7 +20,7 @@ jobs:
         cache-downloads: true
         auto-update-conda: false
         activate-environment: test
-        python-version: "3.10"
+        python-version: "3.12"
 
     - name: Setup conda-build and anaconda-client
       run: |

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -39,11 +39,4 @@ jobs:
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        poetry run pytest
-    - name: Download test data
-      run: |
-        wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML
-        wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML
-    - name: Test percolator ms2rescore
-      run: |
-        quantmsutilsc ms2rescore --psm_file TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML --spectrum_path TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML --processes 2 --ms2pip_model HCD2021 --feature_generators 'ms2pip,deeplc' --id_decoy_pattern ^rev --test_fdr 0.05
+        poetry run pytest
diff --git a/.gitignore b/.gitignore
@@ -162,3 +162,7 @@ cython_debug/
 *_df.csv
 *.tsv
 /tests/test_data/hMICAL1_coiPAnP-N2-200_3Murea-1Mthiourea-200mMtcep_14733.d/
+/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_1fmol_inj1.mzML
+/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_1fmol_inj2.mzML
+/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_25fmol_inj1.mzML
+/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_25fmol_inj2.mzML
diff --git a/README.md b/README.md
@@ -27,10 +27,6 @@ The following functionalities are available in the package:
 - `openms2sample` - Extract sample information from an OpenMS experimental design file. An example of an OpenMS experimental design file is available [here](https://github.com/bigbio/quantms-utils/blob/dev/tests/test_data/BSA_design_urls.tsv).
 - `checksamplesheet` - Check the sample sheet for errors and inconsistencies. The experimental design could be an OpenMS experimental design file or an SDRF file.
 
-### ms2rescore scripts
-
-- `ms2rescore` - Rescore MS2 spectra using the MS2PIP model. The output is a mzML file with the rescored MS2 spectra.
-
 ### Features to percolator scripts
 
 - `sage2feature` - The add_sage_feature function enhances an idXML file by appending additional features from a Sage feature table, excluding those generated by 'psm_file'.

diff --git a/environment.yml b/environment.yml
@@ -7,13 +7,7 @@ channels:
 dependencies:
   - click
   - sdrf-pipelines>=0.0.31
-  - pyopenms>=2.4.0
+  - pyopenms>=3.2.0
   - pandas
-  - numpy
-  - pyarrow
-  - ms2rescore=3.0.3
-  - deepLC=2.2.38
-  - psm-utils=0.8.3
-  - scipy=1.13.1
-  - pygam
-  - protobuf=3.19.6
+  - pyarrow>=16.1.0
+  - scipy
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ name = "quantms-utils"
 description = "Python scripts and helpers for the quantMS workflow"
 readme = "README.md"
 license = "MIT"
-version = "0.0.16"
+version = "0.0.17"
 authors = [
     "Yasset Perez-Riverol <[email protected]>",
     "Dai Chengxin <[email protected]>",
@@ -29,19 +29,13 @@ packages = [
 ]
 
 [tool.poetry.dependencies]
-python = ">=3.8,<3.11"
+python = "*"
 click = "*"
 sdrf-pipelines = ">=0.0.31"
-pyopenms = ">=2.4.0"
-ms2rescore = "3.0.3"
+pyopenms = ">=3.2.0"
 pandas = "*"
-numpy = "*"
-pyarrow = "*"
-psm-utils = "0.8.3"
-deepLC = "2.2.38"
-scipy = "1.13.1"
-pygam = "*"
-protobuf= "3.19.6"
+pyarrow = ">=16.1.0"
+scipy = "*"
 
 [tool.poetry.urls]
 GitHub = "https://github.com/bigbio/quantms-utils"

diff --git a/quantmsutils/__init__.py b/quantmsutils/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.16"
+__version__ = "0.0.17"
diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py
@@ -19,6 +19,8 @@
 from pyopenms import AASequence, FASTAFile, ModificationsDB
 from pyopenms.Constants import PROTON_MASS_U
 
+from quantmsutils.utils.constants import MS_LEVEL, RETENTION_TIME, SCAN, EXPERIMENTAL_MASS_TO_CHARGE
+
 pd.set_option("display.max_rows", 500)
 pd.set_option("display.max_columns", 500)
 pd.set_option("display.width", 1000)
@@ -1097,11 +1099,14 @@ def __find_info(directory, n):
 
         file = __find_info(folder, n)
         target = pd.read_parquet(file)
-        target = target[target["MSLevel"] == 2]
+
+        # Read original parquet columns from mzml_stats
+        target = target[target[MS_LEVEL] == 2]
         target.reset_index(inplace=True, drop=True)
         target["DIANN-intraID"] = target.index
         group.sort_values(by="RT", inplace=True)
-        target = target[["Retention_Time", "SpectrumID", "DIANN-intraID", "Exp_Mass_To_Charge"]]
+        target = target[[RETENTION_TIME, SCAN, "DIANN-intraID", EXPERIMENTAL_MASS_TO_CHARGE]]
+
         target.columns = [
             "RT",
             "opt_global_spectrum_reference",

diff --git a/quantmsutils/mzml/mzml_statistics.py b/quantmsutils/mzml/mzml_statistics.py
@@ -10,6 +10,10 @@
 import pyarrow.parquet as pq
 from pyopenms import MzMLFile
 
+from quantmsutils.utils.constants import CHARGE, SCAN, MS_LEVEL, NUM_PEAKS, BASE_PEAK_INTENSITY, \
+    SUMMED_PEAK_INTENSITY, RETENTION_TIME, EXPERIMENTAL_MASS_TO_CHARGE, ACQUISITION_DATETIME, MZ_ARRAY, \
+    INTENSITY_ARRAY, MONOISOTOPIC_MZ, MAX_INTENSITY
+
 
 class BatchWritingConsumer:
     """
@@ -24,10 +28,12 @@ def __init__(
         output_path,
         batch_size=10000,
         id_only=False,
+        id_output_path=None,
     ):
         self.parquet_schema = parquet_schema
         self.id_parquet_schema = id_parquet_schema
         self.output_path = output_path
+        self.id_output_path = id_output_path
         self.batch_size = batch_size
         self.id_only = id_only
         self.batch_data = []
@@ -69,46 +75,42 @@ def consumeSpectrum(self, spectrum):
             if self.id_only:
                 scan_id = self.scan_pattern.findall(spectrum.getNativeID())[0]
                 self.psm_parts.append(
-                    [
-                        {
-                            "scan": scan_id,
-                            "ms_level": ms_level,
-                            "mz": mz_array,
-                            "intensity": intensity_array,
-                        }
-                    ]
+                    [str(scan_id),
+                            int(ms_level),
+                            mz_array.tolist(),
+                            intensity_array.tolist(),]
                 )
 
             row_data = {
-                "SpectrumID": spectrum.getNativeID(),
-                "MSLevel": float(ms_level),
-                "Charge": float(charge_state) if charge_state is not None else None,
-                "MS_peaks": float(peak_per_ms),
-                "Base_Peak_Intensity": (
+                SCAN: spectrum.getNativeID(),
+                MS_LEVEL: int(ms_level),
+                CHARGE: int(charge_state) if charge_state is not None else None,
+                NUM_PEAKS: int(peak_per_ms),
+                BASE_PEAK_INTENSITY: (
                     float(base_peak_intensity) if base_peak_intensity is not None else None
                 ),
-                "Summed_Peak_Intensities": (
+                SUMMED_PEAK_INTENSITY: (
                     float(total_intensity) if total_intensity is not None else None
                 ),
-                "Retention_Time": float(rt),
-                "Exp_Mass_To_Charge": float(exp_mz) if exp_mz is not None else None,
-                "AcquisitionDateTime": str(self.acquisition_datetime),
+                RETENTION_TIME: float(rt),
+                EXPERIMENTAL_MASS_TO_CHARGE: float(exp_mz) if exp_mz is not None else None,
+                ACQUISITION_DATETIME: str(self.acquisition_datetime),
             }
         elif ms_level == 1:
             row_data = {
-                "SpectrumID": spectrum.getNativeID(),
-                "MSLevel": float(ms_level),
-                "Charge": None,
-                "MS_peaks": float(peak_per_ms),
-                "Base_Peak_Intensity": (
+                SCAN: spectrum.getNativeID(),
+                MS_LEVEL: int(ms_level),
+                CHARGE: None,
+                NUM_PEAKS: int(peak_per_ms),
+                BASE_PEAK_INTENSITY: (
                     float(base_peak_intensity) if base_peak_intensity is not None else None
                 ),
-                "Summed_Peak_Intensities": (
+                SUMMED_PEAK_INTENSITY: (
                     float(total_intensity) if total_intensity is not None else None
                 ),
-                "Retention_Time": float(rt),
-                "Exp_Mass_To_Charge": None,
-                "AcquisitionDateTime": str(self.acquisition_datetime),
+                RETENTION_TIME: float(rt),
+                EXPERIMENTAL_MASS_TO_CHARGE: None,
+                ACQUISITION_DATETIME: str(self.acquisition_datetime),
             }
         else:
             return
@@ -139,7 +141,7 @@ def _write_batch(self):
                     where=self.output_path, schema=self.parquet_schema, compression="gzip"
                 )
 
-            # Create a RecordBatch directly from the current batch
+            # Build a pyarrow RecordBatch from the rows buffered in the current batch
             batch = pa.RecordBatch.from_pylist(self.batch_data, schema=self.parquet_schema)
 
             # Write the batch directly
@@ -148,22 +150,6 @@ def _write_batch(self):
             # Clear the batch data
             self.batch_data = []
 
-            # Handle ID-only data if applicable
-            if self.id_only and self.psm_parts:
-                # Similar approach for spectrum ID data
-                if self.id_parquet_writer is None:
-                    self.id_parquet_writer = pq.ParquetWriter(
-                        where=f"{Path(self.output_path).stem}_spectrum_df.parquet",
-                        schema=self.id_parquet_schema,
-                        compression="gzip",
-                    )
-
-                id_batch = pa.RecordBatch.from_pylist(
-                    self.psm_parts, schema=self.id_parquet_schema
-                )
-                self.id_parquet_writer.write_batch(id_batch)
-                self.psm_parts = []
-
         except Exception as e:
             print(f"Error during batch writing: {e}")
             raise
@@ -176,16 +162,15 @@ def finalize(self):
         if self.batch_data:
             self._write_batch()
 
-        # Write spectrum data if id_only
-        if self.id_only and self.psm_parts:
-            self._write_batch()
-
         if self.parquet_writer:
             self.parquet_writer.close()
 
-        if self.id_parquet_writer:
-            self.id_parquet_writer.close()
-
+        if self.id_only and self.psm_parts:
+            # TODO: write the spectrum data in batches, as is already done for the regular mzML statistics.
+            # TODO: the obstacle is that pa.RecordBatch.from_pylist does not support the list-of-lists rows stored in psm_parts.
+            df = pd.DataFrame(
+                self.psm_parts, columns=["scan", "ms_level", "mz", "intensity"])
+            df.to_parquet(self.id_output_path, index=False, engine="pyarrow", compression="gzip")
 
 def column_exists(conn, table_name: str) -> List[str]:
     """
@@ -221,24 +206,24 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False, batch_size: int =
     """
     schema = pa.schema(
         [
-            pa.field("SpectrumID", pa.string(), nullable=True),
-            pa.field("MSLevel", pa.float64(), nullable=True),
-            pa.field("Charge", pa.float64(), nullable=True),
-            pa.field("MS_peaks", pa.float64(), nullable=True),
-            pa.field("Base_Peak_Intensity", pa.float64(), nullable=True),
-            pa.field("Summed_Peak_Intensities", pa.float64(), nullable=True),
-            pa.field("Retention_Time", pa.float64(), nullable=True),
-            pa.field("Exp_Mass_To_Charge", pa.float64(), nullable=True),
-            pa.field("AcquisitionDateTime", pa.string(), nullable=True),
+            pa.field(SCAN, pa.string(), nullable=True),
+            pa.field(MS_LEVEL, pa.int32(), nullable=True),
+            pa.field(CHARGE, pa.int32(), nullable=True),
+            pa.field(NUM_PEAKS, pa.int32(), nullable=True),
+            pa.field(BASE_PEAK_INTENSITY, pa.float64(), nullable=True),
+            pa.field(SUMMED_PEAK_INTENSITY, pa.float64(), nullable=True),
+            pa.field(RETENTION_TIME, pa.float64(), nullable=True),
+            pa.field(EXPERIMENTAL_MASS_TO_CHARGE, pa.float64(), nullable=True),
+            pa.field(ACQUISITION_DATETIME, pa.string(), nullable=True),
         ]
     )
 
     id_schema = pa.schema(
         [
-            ("scan", pa.string()),
-            ("ms_level", pa.int32()),
-            ("mz", pa.list_(pa.float64())),
-            ("intensity", pa.list_(pa.float64())),
+            (SCAN, pa.string()),
+            (MS_LEVEL, pa.int32()),
+            (MZ_ARRAY, pa.list_(pa.float64())),
+            (INTENSITY_ARRAY, pa.list_(pa.float64())),
         ]
     )
 
@@ -248,6 +233,7 @@ def batch_write_mzml_streaming(
         output_path: str,
         id_parquet_schema: pa.Schema,
         id_only: bool = False,
+        id_output_path: str = None,
         batch_size: int = 10000,
     ) -> Optional[str]:
         """
@@ -258,6 +244,7 @@ def batch_write_mzml_streaming(
             output_path=output_path,
             batch_size=batch_size,
             id_only=id_only,
+            id_output_path=id_output_path,
             id_parquet_schema=id_parquet_schema,
         )
         try:
@@ -308,15 +295,15 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
 
             schema = pa.schema(
                 [
-                    pa.field("Id", pa.int32(), nullable=False),
-                    pa.field("MsMsType", pa.int32(), nullable=True),
-                    pa.field("NumPeaks", pa.int32(), nullable=True),
-                    pa.field("MaxIntensity", pa.float64(), nullable=True),
-                    pa.field("SummedIntensities", pa.float64(), nullable=True),
-                    pa.field("Time", pa.float64(), nullable=True),
-                    pa.field("Charge", pa.int32(), nullable=True),
-                    pa.field("MonoisotopicMz", pa.float64(), nullable=True),
-                    pa.field("AcquisitionDateTime", pa.string(), nullable=True),
+                    pa.field(SCAN, pa.int32(), nullable=False),
+                    pa.field(MS_LEVEL, pa.int32(), nullable=True),
+                    pa.field(NUM_PEAKS, pa.int32(), nullable=True),
+                    pa.field(MAX_INTENSITY, pa.float64(), nullable=True),
+                    pa.field(SUMMED_PEAK_INTENSITY, pa.float64(), nullable=True),
+                    pa.field(RETENTION_TIME, pa.float64(), nullable=True),
+                    pa.field(CHARGE, pa.int32(), nullable=True),
+                    pa.field(MONOISOTOPIC_MZ, pa.float64(), nullable=True),
+                    pa.field(ACQUISITION_DATETIME, pa.string(), nullable=True),
                 ]
             )
 
@@ -341,6 +328,7 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
     # Resolve file path
     ms_path = _resolve_ms_path(ms_path)
     output_path = f"{Path(ms_path).stem}_ms_info.parquet"
+    id_output_path = f"{Path(ms_path).stem}_spectrum_df.parquet"
 
     # Choose processing method based on file type
     if Path(ms_path).suffix == ".d":
@@ -352,6 +340,7 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
             id_parquet_schema=id_schema,
             output_path=output_path,
             id_only=id_only,
+            id_output_path=id_output_path,
             batch_size=batch_size,
         )
     else:

diff --git a/quantmsutils/quantmsutilsc.py b/quantmsutils/quantmsutilsc.py
@@ -5,7 +5,6 @@
 from quantmsutils.features.sage_feature import add_sage_feature
 from quantmsutils.mzml.mzml_statistics import mzml_statistics
 from quantmsutils.psm.psm_conversion import convert_psm
-from quantmsutils.rescoring.ms2rescore import ms2rescore
 from quantmsutils.sdrf.check_samplesheet import checksamplesheet
 from quantmsutils.sdrf.extract_sample import extract_sample_from_expdesign
 from quantmsutils.features.snr import spectrum2feature
@@ -28,7 +27,6 @@ def cli():
 cli.add_command(mzml_statistics)
 cli.add_command(extract_sample_from_expdesign)
 cli.add_command(checksamplesheet)
-cli.add_command(ms2rescore)
 cli.add_command(convert_psm)
 cli.add_command(spectrum2feature)
 

diff --git a/quantmsutils/rescoring/__init__.py b/quantmsutils/rescoring/__init__.py