From fa74a6e26c9b61abcc6632431f1d106821e522fb Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Fri, 28 Jun 2024 15:03:39 +0800
Subject: [PATCH 1/9] Tune the merge segments parameter to fix the issue where
 the male and female Xinwen Lianbo anchors saying "good evening" were merged
 into a single speakid
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 funasr/models/campplus/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/funasr/models/campplus/utils.py b/funasr/models/campplus/utils.py
index cb8d4b31a..94b4952de 100644
--- a/funasr/models/campplus/utils.py
+++ b/funasr/models/campplus/utils.py
@@ -169,7 +169,7 @@ def merge_seque(distribute_res):
     return res


-def smooth(res, mindur=1):
+def smooth(res, mindur=0.7):
     # if only one segment, return directly
     if len(res) < 2:
         return res

From b1836414b541e78bbed801784dfd7b6eb5aca244 Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Fri, 28 Jun 2024 15:52:52 +0800
Subject: [PATCH 2/9] Fix the timestamp bug between split sentences
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 funasr/utils/timestamp_tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 6abebe165..4fc2d15c5 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -156,6 +156,7 @@ def timestamp_sentence(
             punc_id = int(punc_id) if punc_id is not None else 1
             sentence_end = timestamp[1] if timestamp is not None else sentence_end
+            sentence_start = timestamp[0] if timestamp is not None else sentence_start
             sentence_text_seg = (
                 sentence_text_seg[:-1] if sentence_text_seg[-1] == " " else sentence_text_seg
             )
@@ -183,7 +184,6 @@
             sentence_text = ""
             sentence_text_seg = ""
             ts_list = []
-            sentence_start = sentence_end

     return res

From b31592acd7831401b9b26de0b277fd53112f4ffd Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Wed, 3 Jul 2024 11:33:21 +0800
Subject: [PATCH 3/9] Improve the speakid-to-sentence matching logic; partially
 fixes the issue where speakids do not increment from 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 funasr/models/campplus/utils.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/funasr/models/campplus/utils.py b/funasr/models/campplus/utils.py
index 94b4952de..4addd0a8a 100644
--- a/funasr/models/campplus/utils.py
+++ b/funasr/models/campplus/utils.py
@@ -193,23 +193,21 @@ def smooth(res, mindur=0.7):


 def distribute_spk(sentence_list, sd_time_list):
-    sd_sentence_list = []
+    sd_time_list = [(spk_st * 1000, spk_ed * 1000, spk) for spk_st, spk_ed, spk in sd_time_list]
     for d in sentence_list:
-        sentence_start = d["start"]
-        sentence_end = d["end"]
+        sentence_start = d['start']
+        sentence_end = d['end']
         sentence_spk = 0
         max_overlap = 0
-        for sd_time in sd_time_list:
-            spk_st, spk_ed, spk = sd_time
-            spk_st = spk_st * 1000
-            spk_ed = spk_ed * 1000
+        for spk_st, spk_ed, spk in sd_time_list:
             overlap = max(min(sentence_end, spk_ed) - max(sentence_start, spk_st), 0)
             if overlap > max_overlap:
                 max_overlap = overlap
                 sentence_spk = spk
-        d["spk"] = int(sentence_spk)
-        sd_sentence_list.append(d)
-    return sd_sentence_list
+            if overlap > 0 and sentence_spk == spk:
+                max_overlap += overlap
+        d['spk'] = int(sentence_spk)
+    return sentence_list


 class Storage(metaclass=ABCMeta):

From 0528806aa7f9bfd10378eae5c86f81aa2db795f9 Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Wed, 3 Jul 2024 11:41:17 +0800
Subject: [PATCH 4/9] Revert "Improve the speakid-to-sentence matching logic;
 partially fixes the issue where speakids do not increment from 0"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit b31592acd7831401b9b26de0b277fd53112f4ffd.
---
 funasr/models/campplus/utils.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/funasr/models/campplus/utils.py b/funasr/models/campplus/utils.py
index 4addd0a8a..94b4952de 100644
--- a/funasr/models/campplus/utils.py
+++ b/funasr/models/campplus/utils.py
@@ -193,21 +193,23 @@ def smooth(res, mindur=0.7):


 def distribute_spk(sentence_list, sd_time_list):
-    sd_time_list = [(spk_st * 1000, spk_ed * 1000, spk) for spk_st, spk_ed, spk in sd_time_list]
+    sd_sentence_list = []
     for d in sentence_list:
-        sentence_start = d['start']
-        sentence_end = d['end']
+        sentence_start = d["start"]
+        sentence_end = d["end"]
         sentence_spk = 0
         max_overlap = 0
-        for spk_st, spk_ed, spk in sd_time_list:
+        for sd_time in sd_time_list:
+            spk_st, spk_ed, spk = sd_time
+            spk_st = spk_st * 1000
+            spk_ed = spk_ed * 1000
             overlap = max(min(sentence_end, spk_ed) - max(sentence_start, spk_st), 0)
             if overlap > max_overlap:
                 max_overlap = overlap
                 sentence_spk = spk
-            if overlap > 0 and sentence_spk == spk:
-                max_overlap += overlap
-        d['spk'] = int(sentence_spk)
-    return sentence_list
+        d["spk"] = int(sentence_spk)
+        sd_sentence_list.append(d)
+    return sd_sentence_list


 class Storage(metaclass=ABCMeta):

From be015ec75dcb021961c2014e20ea5c84e36cf610 Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Wed, 3 Jul 2024 11:42:01 +0800
Subject: [PATCH 5/9] Improve the speakid-to-sentence matching logic; partially
 fixes the issue where speakids do not increment from 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 funasr/models/campplus/utils.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/funasr/models/campplus/utils.py b/funasr/models/campplus/utils.py
index 94b4952de..4addd0a8a 100644
--- a/funasr/models/campplus/utils.py
+++ b/funasr/models/campplus/utils.py
@@ -193,23 +193,21 @@ def smooth(res, mindur=0.7):


 def distribute_spk(sentence_list, sd_time_list):
-    sd_sentence_list = []
+    sd_time_list = [(spk_st * 1000, spk_ed * 1000, spk) for spk_st, spk_ed, spk in sd_time_list]
     for d in sentence_list:
-        sentence_start = d["start"]
-        sentence_end = d["end"]
+        sentence_start = d['start']
+        sentence_end = d['end']
         sentence_spk = 0
         max_overlap = 0
-        for sd_time in sd_time_list:
-            spk_st, spk_ed, spk = sd_time
-            spk_st = spk_st * 1000
-            spk_ed = spk_ed * 1000
+        for spk_st, spk_ed, spk in sd_time_list:
             overlap = max(min(sentence_end, spk_ed) - max(sentence_start, spk_st), 0)
             if overlap > max_overlap:
                 max_overlap = overlap
                 sentence_spk = spk
-        d["spk"] = int(sentence_spk)
-        sd_sentence_list.append(d)
-    return sd_sentence_list
+            if overlap > 0 and sentence_spk == spk:
+                max_overlap += overlap
+        d['spk'] = int(sentence_spk)
+    return sentence_list


 class Storage(metaclass=ABCMeta):

From b407b4c34538a778f3314f261eedd4a72d268fef Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Wed, 7 Aug 2024 16:05:21 +0800
Subject: [PATCH 6/9] Add emotion recognition to the speaker diarization
 pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 funasr/auto/auto_model.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py
index f735f1865..e2c21d487 100644
--- a/funasr/auto/auto_model.py
+++ b/funasr/auto/auto_model.py
@@ -160,7 +160,11 @@ def __init__(self, **kwargs):
         if spk_mode not in ["default", "vad_segment", "punc_segment"]:
             logging.error("spk_mode should be one of default, vad_segment and punc_segment.")
         self.spk_mode = spk_mode
-
+        ser_model = kwargs.get("ser_model", None)
+        ser_kwargs = {} if kwargs.get("ser_kwargs", {}) is None else kwargs.get("ser_kwargs", {})
+        if ser_model is not None:
+            logging.info("Building SER model.")
+            ser_model, ser_kwargs = self.build_model(**ser_kwargs)
         self.kwargs = kwargs
         self.model = model
         self.vad_model = vad_model
@@ -169,6 +173,8 @@ def __init__(self, **kwargs):
         self.punc_kwargs = punc_kwargs
         self.spk_model = spk_model
         self.spk_kwargs = spk_kwargs
+        self.ser_model = ser_model
+        self.ser_kwargs = ser_kwargs
         self.model_path = kwargs.get("model_path")

     @staticmethod
@@ -437,6 +443,11 @@ def inference_with_vad(self, input, input_len=None, **cfg):
                        speech_b, input_len=None, model=self.spk_model, kwargs=kwargs, **cfg
                    )
                    results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
+                   if self.ser_model is not None:
+                       ser_res = self.inference(speech_b, input_len=None, model=self.ser_model, kwargs=self.ser_kwargs, **cfg)
+                       if "SenseVoiceSmall" in kwargs.get("ser_model", None):
+                           results[_b]["ser_type"] = [i['text'].split("|><|")[1] for i in ser_res]
+
                beg_idx = end_idx
                end_idx += 1
                max_len_in_batch = sample_length
@@ -529,6 +540,7 @@ def inference_with_vad(self, input, input_len=None, **cfg):
                                "end": vadsegment[1],
                                "sentence": rest["text"],
                                "timestamp": rest["timestamp"],
+                               "emotion": rest["ser_type"],
                            }
                        )
            elif self.spk_mode == "punc_segment":
@@ -552,6 +564,9 @@ def inference_with_vad(self, input, input_len=None, **cfg):
                    raw_text,
                    return_raw_text=return_raw_text,
                )
+               if len(sentence_list) == len(result["ser_type"]):
+                   for i in range(len(sentence_list)):
+                       sentence_list[i]["emotion"] = result["ser_type"][i]
                distribute_spk(sentence_list, sv_output)
                result["sentence_info"] = sentence_list
            elif kwargs.get("sentence_timestamp", False):
@@ -575,6 +590,8 @@ def inference_with_vad(self, input, input_len=None, **cfg):
                result["sentence_info"] = sentence_list
            if "spk_embedding" in result:
                del result["spk_embedding"]
+           if "ser_type" in result:
+               del result["ser_type"]

            result["key"] = key
            results_ret_list.append(result)

From 3ad0599437e0f6bf5d85d4cc54301027fb01e15f Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Fri, 9 Aug 2024 13:43:59 +0800
Subject: [PATCH 7/9] Improve emotion recognition post-processing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 funasr/auto/auto_model.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py
index e2c21d487..11399d286 100644
--- a/funasr/auto/auto_model.py
+++ b/funasr/auto/auto_model.py
@@ -107,6 +107,24 @@ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
     return key_list, data_list


+def distribute_emotion(sentence_list, ser_time_list):
+    ser_time_list = [(st * 1000, ed * 1000, emotion) for st, ed, emotion in ser_time_list]
+    for d in sentence_list:
+        sentence_start = d['start']
+        sentence_end = d['end']
+        sentence_emotion = "EMO_UNKNOWN"
+        max_overlap = 0
+        for st, ed, emotion in ser_time_list:
+            overlap = max(min(sentence_end, ed) - max(sentence_start, st), 0)
+            if overlap > max_overlap:
+                max_overlap = overlap
+                sentence_emotion = emotion
+            if overlap > 0 and sentence_emotion == emotion:
+                max_overlap += overlap
+        d['emotion'] = sentence_emotion
+    return sentence_list
+
+
 class AutoModel:

     def __init__(self, **kwargs):
@@ -567,6 +585,9 @@ def inference_with_vad(self, input, input_len=None, **cfg):
                if len(sentence_list) == len(result["ser_type"]):
                    for i in range(len(sentence_list)):
                        sentence_list[i]["emotion"] = result["ser_type"][i]
+               else:
+                   merged_list = [[x[0], x[1], y] for x, y in zip(all_segments, result["ser_type"])]
+                   distribute_emotion(sentence_list, merged_list)
                distribute_spk(sentence_list, sv_output)
                result["sentence_info"] = sentence_list
            elif kwargs.get("sentence_timestamp", False):

From 9b423d3d6a2ac8b10955221081a6cce6b8947cdb Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Mon, 12 Aug 2024 11:24:47 +0800
Subject: [PATCH 8/9] Add support for the emotion2vec model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 funasr/auto/auto_model.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py
index 11399d286..1ae8dadc9 100644
--- a/funasr/auto/auto_model.py
+++ b/funasr/auto/auto_model.py
@@ -462,9 +462,14 @@ def inference_with_vad(self, input, input_len=None, **cfg):
                    )
                    results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
                    if self.ser_model is not None:
-                       ser_res = self.inference(speech_b, input_len=None, model=self.ser_model, kwargs=self.ser_kwargs, **cfg)
+                       ser_res = self.inference(speech_b, input_len=None, model=self.ser_model,
+                                                kwargs=self.ser_kwargs, **cfg)
                        if "SenseVoiceSmall" in kwargs.get("ser_model", None):
                            results[_b]["ser_type"] = [i['text'].split("|><|")[1] for i in ser_res]
+                       elif "emotion2vec" in kwargs.get("ser_model", None):
+                           results[_b]["ser_type"] = [i['labels'][i["scores"].index(max(i["scores"]))] for i in ser_res]
+
+

                beg_idx = end_idx
                end_idx += 1

From dc06a80dbc32164626dafe216b41763679d40711 Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Mon, 19 Aug 2024 09:27:34 +0800
Subject: [PATCH 9/9] fix: emotion recognition post-processing bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 funasr/auto/auto_model.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py
index 1ae8dadc9..c4af0d980 100644
--- a/funasr/auto/auto_model.py
+++ b/funasr/auto/auto_model.py
@@ -587,12 +587,13 @@ def inference_with_vad(self, input, input_len=None, **cfg):
                    raw_text,
                    return_raw_text=return_raw_text,
                )
-               if len(sentence_list) == len(result["ser_type"]):
-                   for i in range(len(sentence_list)):
-                       sentence_list[i]["emotion"] = result["ser_type"][i]
-               else:
-                   merged_list = [[x[0], x[1], y] for x, y in zip(all_segments, result["ser_type"])]
-                   distribute_emotion(sentence_list, merged_list)
+               if "ser_type" in result:
+                   if len(sentence_list) == len(result["ser_type"]):
+                       for i in range(len(sentence_list)):
+                           sentence_list[i]["emotion"] = result["ser_type"][i]
+                   else:
+                       merged_list = [[x[0], x[1], y] for x, y in zip(all_segments, result["ser_type"])]
+                       distribute_emotion(sentence_list, merged_list)
                distribute_spk(sentence_list, sv_output)
                result["sentence_info"] = sentence_list
            elif kwargs.get("sentence_timestamp", False):
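
Note (illustration, not part of the patches): patches 3/5 and 7 both use the same idea, namely walk the diarization or emotion segments and give each sentence the label of the segment that overlaps it the most, converting segment times from seconds to milliseconds so they compare against the sentence timestamps. Below is a minimal, self-contained sketch of that assignment step; the helper name assign_by_overlap and the sample numbers are invented here for illustration and do not appear in the patches.

    def assign_by_overlap(sentence_list, segment_list, key, default):
        """Assign each sentence the label of the segment overlapping it the most.

        sentence_list: [{"start": ms, "end": ms, ...}, ...]
        segment_list:  [(start_sec, end_sec, label), ...]
        """
        # Convert segment boundaries to milliseconds, mirroring the *1000 step in the patches.
        segments_ms = [(st * 1000, ed * 1000, label) for st, ed, label in segment_list]
        for sentence in sentence_list:
            best_label, best_overlap = default, 0
            for st, ed, label in segments_ms:
                # Overlap of [sentence start, sentence end] with [st, ed], clamped at 0.
                overlap = max(min(sentence["end"], ed) - max(sentence["start"], st), 0)
                if overlap > best_overlap:
                    best_overlap, best_label = overlap, label
            sentence[key] = best_label
        return sentence_list


    if __name__ == "__main__":
        sentences = [{"start": 0, "end": 2300}, {"start": 2300, "end": 5100}]
        diarization = [(0.0, 2.2, 0), (2.2, 5.2, 1)]             # (start_s, end_s, speaker id)
        emotions = [(0.0, 2.2, "HAPPY"), (2.2, 5.2, "NEUTRAL")]  # (start_s, end_s, emotion label)
        assign_by_overlap(sentences, diarization, key="spk", default=0)
        assign_by_overlap(sentences, emotions, key="emotion", default="EMO_UNKNOWN")
        print(sentences)
        # [{'start': 0, 'end': 2300, 'spk': 0, 'emotion': 'HAPPY'},
        #  {'start': 2300, 'end': 5100, 'spk': 1, 'emotion': 'NEUTRAL'}]

Design note: patch 9 guards the emotion post-processing behind "ser_type" being present in the result, so runs without an SER model keep the previous behaviour; when the number of sentences matches the number of SER segments the labels are copied one-to-one, and the overlap-based distribute_emotion fallback is only used otherwise.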