
Commit 6636a9d

[cli] Using Hamming Window (#407)
1 parent ee797dd commit 6636a9d

File tree

1 file changed (+11, -7)


wespeaker/cli/speaker.py

Lines changed: 11 additions & 7 deletions
@@ -101,7 +101,8 @@ def compute_fbank(self,
                            num_mel_bins=num_mel_bins,
                            frame_length=frame_length,
                            frame_shift=frame_shift,
-                           sample_frequency=sample_rate)
+                           sample_frequency=sample_rate,
+                           window_type='hamming')
         if cmn:
             feat = feat - torch.mean(feat, 0)
         return feat
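The only behavioral change in this commit is the explicit Hamming window passed to the Kaldi-compatible fbank extractor (torchaudio defaults to a Povey window). A minimal standalone sketch of that call with the CLI's default feature settings; the helper name is illustrative, not wespeaker's API:

import torch
import torchaudio.compliance.kaldi as kaldi

def fbank_with_hamming(waveform: torch.Tensor, sample_rate: int = 16000):
    # kaldi.fbank defaults to window_type='povey'; this commit makes the
    # CLI use a Hamming window explicitly.
    feat = kaldi.fbank(waveform,
                       num_mel_bins=80,
                       frame_length=25,
                       frame_shift=10,
                       sample_frequency=sample_rate,
                       window_type='hamming')
    # Cepstral mean normalization, as in the surrounding code.
    return feat - torch.mean(feat, 0)

kaldi.fbank expects the waveform as a (channels, num_samples) tensor, e.g. the output of torchaudio.load.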
@@ -117,8 +118,8 @@ def extract_embedding_feats(self, fbanks, batch_size, subseg_cmn):
             batch_feats = fbanks_array[i:i + batch_size]
             with torch.no_grad():
                 batch_embs = self.model(batch_feats)
-                batch_embs = batch_embs[-1] if isinstance(batch_embs,
-                                                          tuple) else batch_embs
+                batch_embs = batch_embs[-1] if isinstance(
+                    batch_embs, tuple) else batch_embs
             embeddings.append(batch_embs.detach().cpu().numpy())
         embeddings = np.vstack(embeddings)
         return embeddings
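The re-wrapped lines above keep the existing behavior: if the backbone's forward pass returns a tuple, the last element is taken as the utterance-level embedding. A small standalone sketch of that batching loop, with model and fbanks_array as placeholders:

import numpy as np
import torch

def extract_embeddings_sketch(model, fbanks_array, batch_size):
    embeddings = []
    for i in range(0, fbanks_array.shape[0], batch_size):
        batch_feats = fbanks_array[i:i + batch_size]
        with torch.no_grad():
            batch_embs = model(batch_feats)
            # Some backbones return a tuple; the last element is the embedding.
            batch_embs = batch_embs[-1] if isinstance(batch_embs,
                                                      tuple) else batch_embs
        embeddings.append(batch_embs.detach().cpu().numpy())
    return np.vstack(embeddings)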
@@ -139,10 +140,11 @@ def extract_embedding_from_pcm(self, pcm: torch.Tensor, sample_rate: int):
 
         if sample_rate != vad_sample_rate:
             transform = torchaudio.transforms.Resample(
-                orig_freq=sample_rate,
-                new_freq=vad_sample_rate)
+                orig_freq=sample_rate, new_freq=vad_sample_rate)
             wav = transform(wav)
-        segments = get_speech_timestamps(wav, self.vad, return_seconds=True)
+        segments = get_speech_timestamps(wav,
+                                         self.vad,
+                                         return_seconds=True)
         pcmTotal = torch.Tensor()
         if len(segments) > 0:  # remove all the silence
             for segment in segments:
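The re-wrapped calls in this file use Silero VAD's get_speech_timestamps helper. A hedged standalone sketch of that API, assuming the pip-installed silero-vad package (the wav path is a placeholder):

from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

vad = load_silero_vad()
wav = read_audio('example.wav')  # read_audio returns a 16 kHz mono tensor
segments = get_speech_timestamps(wav, vad, return_seconds=True)
# With return_seconds=True each entry looks like {'start': 0.5, 'end': 2.3};
# without it, start/end are sample offsets.

In the CLI code the loaded VAD model is stored as self.vad, which plays the role of vad here.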
@@ -218,7 +220,9 @@ def diarize(self, audio_path: str, utt: str = "unk"):
         pcm, sample_rate = torchaudio.load(audio_path, normalize=False)
         # 1. vad
         wav = read_audio(audio_path)
-        vad_segments = get_speech_timestamps(wav, self.vad, return_seconds=True)
+        vad_segments = get_speech_timestamps(wav,
+                                             self.vad,
+                                             return_seconds=True)
 
         # 2. extact fbanks
         subsegs, subseg_fbanks = [], []
