77import logging
88import re
99from typing import List , Tuple
10-
10+ from collections import defaultdict
1111import click
1212from sdrf_pipelines .openms .unimod import UnimodDatabase
1313
# Configure the root logger when this module is imported: DEBUG level, with the
# timestamp and the name of the calling function in front of every message.
logging .basicConfig (format = "%(asctime)s [%(funcName)s] - %(message)s" , level = logging .DEBUG )
# Module-scoped logger named after this module.
logger = logging .getLogger (__name__ )

# Lazy initialization of UnimodDatabase for improved testability.
# The database is created on first access rather than at module import time,
# which allows tests to mock or replace it more easily.
# This variable is the cache slot: None means "not created yet".
_unimod_database = None
21+
22+
def get_unimod_database():
    """
    Return the shared UnimodDatabase, building it the first time it is requested.

    The instance is cached in the module-level ``_unimod_database`` variable, so
    nothing is constructed when the module is merely imported. Resetting that
    variable to ``None`` makes the next call build a fresh instance.

    :return: The cached UnimodDatabase instance.
    """
    global _unimod_database
    # Fast path: an instance was already created by an earlier call.
    if _unimod_database is not None:
        return _unimod_database
    _unimod_database = UnimodDatabase()
    return _unimod_database
37+
38+
# Met-loss modification constant (UniMod:765) with mass shift and site specification.
# Laid out as "accession,mass,site". dianncfg compares each variable modification
# against this exact string and, on a match, writes the --met-excision flag instead
# of a --var-mod entry, so the value must stay in sync with what convert_mod emits.
MET_LOSS_MODIFICATION = "UniMod:765,-131.040485,*nM"
41+
1742
1843@click .command ("dianncfg" , short_help = "Create DIA-NN config file with enzyme and PTMs" )
1944@click .option ("--enzyme" , "-e" , help = "" )
@@ -32,8 +57,7 @@ def dianncfg(ctx, enzyme, fix_mod, var_mod):
3257 :param var_mod: A string of variable modifications, separated by commas.
3358 """
3459 cut = enzyme_cut (enzyme )
35- unimod_database = UnimodDatabase ()
36- fix_ptm , var_ptm = convert_mod (unimod_database , fix_mod , var_mod )
60+ fix_ptm , var_ptm = convert_mod (fix_mod , var_mod )
3761
3862 var_ptm_str = " --var-mod "
3963 fix_ptm_str = " --fixed-mod "
@@ -42,83 +66,106 @@ def dianncfg(ctx, enzyme, fix_mod, var_mod):
4266 for mod in fix_ptm :
4367 diann_fix_ptm += fix_ptm_str + mod
4468 for mod in var_ptm :
45- diann_var_ptm += var_ptm_str + mod
69+ if mod == MET_LOSS_MODIFICATION :
70+ diann_var_ptm += " --met-excision "
71+ else :
72+ diann_var_ptm += var_ptm_str + mod
4673
4774 with open ("diann_config.cfg" , "w" ) as file :
4875 file .write ("--cut " + cut + diann_fix_ptm + diann_var_ptm )
4976
5077
51- def convert_mod (unimod_database , fix_mod : str , var_mod : str ) -> Tuple [List , List ]:
def get_mod(mod, mod_type):
    """
    Retrieve and format a modification from the Unimod database for DIA-NN compatibility.

    The text before the first space in ``mod`` is matched against the names of the
    Unimod modifications; the site is taken from the first parenthesised group in
    the remainder of the string (e.g. ``"Oxidation (M)"`` gives site ``"M"``).
    ``"Protein N-term"`` is translated to ``"*n"`` and ``"N-term"`` to ``"n"``.
    A terminus combined with a residue is only accepted for variable Met-loss on
    ``"Protein N-term M"`` (site ``"*nM"``).

    :param mod: The modification string, typically containing the modification name and site.
    :param mod_type: The type of modification ('fixed_mod' or 'var_mod').
    :return: A tuple (diann_mod_accession, site), where diann_mod_accession is a
        "UniMod:<id>,<mono mass>" string for DIA-NN and site is the modification site.
    :raises SystemExit: With exit code 1 (after logging an error) if the modification is
        not in the Unimod database, is a labelling modification, has no parenthesised
        site, or restricts a terminus to an unsupported residue.
    """
    pattern = re.compile(r"\((.*?)\)")

    # Split once instead of on every loop iteration: the first token is the Unimod
    # name, everything after the first space carries the site specification.
    mod_name, _, site_spec = mod.partition(" ")

    matched = next(
        (entry for entry in get_unimod_database().modifications if entry.get_name() == mod_name),
        None,
    )
    if matched is None:
        logging.error(
            f"Only Unimod modifications are currently supported for the DIA pipeline. Unsupported modification: {mod} "
        )
        # Raise SystemExit directly: the exit() helper is injected by the site module
        # for interactive use and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)

    diann_mod_name = matched.get_name()
    diann_mod_accession = (
        matched.get_accession().replace("UNIMOD:", "UniMod:") + "," + str(matched._delta_mono_mass)
    )

    # TODO support DIA multiplex
    label_markers = ("TMT", "Label:", "iTRAQ", "mTRAQ", "Dimethyl:")
    if any(marker in diann_mod_name for marker in label_markers):
        logging.error(
            "quantms DIA-NN workflow only supports LFQ now! Unsupported modifications: "
            + mod
        )
        raise SystemExit(1)

    sites = pattern.findall(site_spec)
    if not sites:
        logging.error(
            f"No site specification found in modification string: {mod} "
        )
        raise SystemExit(1)

    # Site spellings used in the input mapped to the codes written to the config.
    terminus_codes = {"Protein N-term": "*n", "N-term": "n"}
    site = sites[0]
    if site in terminus_codes:
        site = terminus_codes[site]
    elif " " in site:
        # "<position> <residue>": the last word is the residue, the rest the position.
        position, _, residue = site.rpartition(" ")
        site = terminus_codes.get(position, position) + residue
        # The only terminus+residue combination let through is variable Met-loss.
        if site == "*nM" and diann_mod_name == "Met-loss" and mod_type == "var_mod":
            return diann_mod_accession, site
        logging.error("Restricting to certain terminal AAs isn't directly supported. Please see https://github.com/vdemichev/DiaNN/issues/1791")
        raise SystemExit(1)
    return diann_mod_accession, site
144+
145+
def _merge_mods_by_accession(mods: str, mod_type: str) -> List[str]:
    """Resolve each comma-separated modification and merge the sites of identical ones."""
    merged = defaultdict(list)
    for raw_mod in mods.split(","):
        mod = raw_mod.strip()
        # Tolerate "A (X), B (Y)" and stray/trailing commas instead of aborting
        # on an empty modification name.
        if not mod:
            continue
        diann_mod, site = get_mod(mod, mod_type)
        merged[diann_mod].append(site)

    entries = []
    for accession, site_list in merged.items():
        # Duplicate sites are dropped and the remainder concatenated in sorted order.
        site_str = "".join(sorted(set(site_list)))
        entries.append(f"{accession},{site_str}")
    return entries


def convert_mod(fix_mod: str, var_mod: str) -> Tuple[List, List]:
    """
    Convert fixed and variable modification strings into DIA-NN modification entries.

    Each argument is a comma-separated list of modifications, every one resolved
    through ``get_mod``. Modifications that resolve to the same accession/mass are
    merged into a single entry whose sites are concatenated, giving entries of the
    form ``"<accession>,<mass>,<sites>"``. Surrounding whitespace around each
    modification is ignored and blank items are skipped.

    :param fix_mod: Comma-separated fixed modifications; empty or None yields no entries.
    :param var_mod: Comma-separated variable modifications; empty or None yields no entries.
    :return: A tuple (fix_ptm, var_ptm) of lists of DIA-NN modification entries.
    :raises SystemExit: Propagated from ``get_mod`` for unsupported modifications.
    """
    fix_ptm = _merge_mods_by_accession(fix_mod, "fixed_mod") if fix_mod else []
    var_ptm = _merge_mods_by_accession(var_mod, "var_mod") if var_mod else []
    return fix_ptm, var_ptm
124171
0 commit comments