@@ -101,7 +101,8 @@ def compute_fbank(self,
101101 num_mel_bins = num_mel_bins ,
102102 frame_length = frame_length ,
103103 frame_shift = frame_shift ,
104- sample_frequency = sample_rate )
104+ sample_frequency = sample_rate ,
105+ window_type = 'hamming' )
105106 if cmn :
106107 feat = feat - torch .mean (feat , 0 )
107108 return feat
@@ -117,8 +118,8 @@ def extract_embedding_feats(self, fbanks, batch_size, subseg_cmn):
117118 batch_feats = fbanks_array [i :i + batch_size ]
118119 with torch .no_grad ():
119120 batch_embs = self .model (batch_feats )
120- batch_embs = batch_embs [- 1 ] if isinstance (batch_embs ,
121- tuple ) else batch_embs
121+ batch_embs = batch_embs [- 1 ] if isinstance (
122+ batch_embs , tuple ) else batch_embs
122123 embeddings .append (batch_embs .detach ().cpu ().numpy ())
123124 embeddings = np .vstack (embeddings )
124125 return embeddings
@@ -139,10 +140,11 @@ def extract_embedding_from_pcm(self, pcm: torch.Tensor, sample_rate: int):
139140
140141 if sample_rate != vad_sample_rate :
141142 transform = torchaudio .transforms .Resample (
142- orig_freq = sample_rate ,
143- new_freq = vad_sample_rate )
143+ orig_freq = sample_rate , new_freq = vad_sample_rate )
144144 wav = transform (wav )
145- segments = get_speech_timestamps (wav , self .vad , return_seconds = True )
145+ segments = get_speech_timestamps (wav ,
146+ self .vad ,
147+ return_seconds = True )
146148 pcmTotal = torch .Tensor ()
147149 if len (segments ) > 0 : # remove all the silence
148150 for segment in segments :
@@ -218,7 +220,9 @@ def diarize(self, audio_path: str, utt: str = "unk"):
218220 pcm , sample_rate = torchaudio .load (audio_path , normalize = False )
219221 # 1. vad
220222 wav = read_audio (audio_path )
221- vad_segments = get_speech_timestamps (wav , self .vad , return_seconds = True )
223+ vad_segments = get_speech_timestamps (wav ,
224+ self .vad ,
225+ return_seconds = True )
222226
223227 # 2. extact fbanks
224228 subsegs , subseg_fbanks = [], []
0 commit comments