diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index d20860203e7e..2ff70c34a2fd 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -573,7 +573,6 @@ def transcribe( f"but got {type(override_config)}" ) trcfg = override_config - trcfg.timestamps = timestamps if trcfg.enable_chunking: # Check if only one audio is provided with string @@ -1099,6 +1098,7 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo main_model_predictions=hypotheses, timestamp_type='char' if merge_to_be_done else ['word', 'segment'], viterbi_device=trcfg._internal.device, + verbose=trcfg.verbose, ) elif trcfg.timestamps: hypotheses = process_aed_timestamp_outputs( diff --git a/nemo/collections/asr/parts/mixins/transcription.py b/nemo/collections/asr/parts/mixins/transcription.py index 7a48b99049b3..89f1d76e8172 100644 --- a/nemo/collections/asr/parts/mixins/transcription.py +++ b/nemo/collections/asr/parts/mixins/transcription.py @@ -253,7 +253,6 @@ def transcribe( - Dict[str, List[str/Hypothesis]] """ - if override_config is None: transcribe_cfg = TranscribeConfig( use_lhotse=use_lhotse, @@ -348,7 +347,6 @@ def transcribe_generator(self, audio, override_config: Optional[TranscribeConfig """ A generator version of `transcribe` function. """ - if override_config is None: override_config = TranscribeConfig() diff --git a/nemo/collections/asr/parts/utils/aligner_utils.py b/nemo/collections/asr/parts/utils/aligner_utils.py index 5d2b719c2521..2c6b5f1f5138 100644 --- a/nemo/collections/asr/parts/utils/aligner_utils.py +++ b/nemo/collections/asr/parts/utils/aligner_utils.py @@ -878,6 +878,7 @@ def get_batch_variables( buffered_chunk_params: dict = {}, padding_value: float = -3.4e38, has_hypotheses: bool = False, + verbose: bool = False, ): """ Args: @@ -947,7 +948,9 @@ def get_batch_variables( if has_hypotheses: hypotheses = audio else: - hypotheses = model.transcribe(audio, return_hypotheses=True, batch_size=batch_size) + hypotheses = model.transcribe( + audio, return_hypotheses=True, batch_size=batch_size, verbose=verbose + ) else: assert isinstance(audio, list) or isinstance( audio, str diff --git a/nemo/collections/asr/parts/utils/timestamp_utils.py b/nemo/collections/asr/parts/utils/timestamp_utils.py index 6e21fdf08d75..f7f9f84722e4 100644 --- a/nemo/collections/asr/parts/utils/timestamp_utils.py +++ b/nemo/collections/asr/parts/utils/timestamp_utils.py @@ -490,6 +490,7 @@ def get_forced_aligned_timestamps_with_external_model( supported_punctuation: Optional[Union[Set, List[str]]] = {',', '.', '!', '?'}, timestamp_type: Optional[Union[str, List[str]]] = "all", has_hypotheses: bool = False, + verbose: bool = False, ) -> List[Hypothesis]: """ Extracts the word, segment and char timestamps by aligning the audio with the external ASR model and adds them to the provided Hypothesis objects. @@ -649,6 +650,7 @@ def process_timestamps(utt_obj, output_timestep_duration, timestamp_type): word_separator=word_separator, gt_text_batch=[hyp.text for hyp in main_model_predictions[start_idx:end_idx]], has_hypotheses=has_hypotheses, + verbose=verbose, ) alignments_batch = viterbi_decoding(