Skip to content

Commit 03dd159

Browse files
committed
Add options to convert vocal features to log scale
1 parent 1728a57 commit 03dd159

1 file changed

Lines changed: 32 additions & 6 deletions

File tree

speechbrain/processing/voice_analysis.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def vocal_characteristics(
2525
sample_rate: int = 16000,
2626
harmonicity_threshold: float = 0.45,
2727
jitter_threshold: float = 0.02,
28+
log_score: bool = True,
2829
):
2930
"""Estimates the vocal characteristics of a signal using auto-correlation, etc.
3031
@@ -50,6 +51,8 @@ def vocal_characteristics(
5051
jitter_threshold: float
5152
One of two threshold values for considering a frame as voiced. Estimated
5253
jitter values greater than this are considered unvoiced.
54+
log_score: bool
55+
Whether to represent the jitter, shimmer, and HNR values on a log scale.
5356
5457
Returns
5558
-------
@@ -100,8 +103,13 @@ def vocal_characteristics(
100103
# By J. Fernandez, F. Teixeira, V. Guedes, A. Junior, and J. P. Teixeira
101104
# Term is dominated by denominator, so just take -1 * log(noise)
102105
# max value for harmonicity is 25 dB, enforced by this minimum here
103-
noise = torch.clamp(1 - harmonicity, min=EPSILON)
104-
hnr = -10 * torch.log10(noise)
106+
if log_score:
107+
noise = torch.clamp(1 - harmonicity, min=EPSILON)
108+
hnr = -10 * torch.log10(noise)
109+
jitter = -10 * torch.log10(jitter.clamp(min=EPSILON))
110+
shimmer = -10 * torch.log10(shimmer.clamp(min=EPSILON))
111+
else:
112+
hnr = 1 - harmonicity
105113

106114
return estimated_f0, voiced, jitter, shimmer, hnr
107115

@@ -411,7 +419,16 @@ def compute_cross_correlation(frames_a, frames_b, width=None):
411419
return cross_correlation
412420

413421

414-
def compute_gne(audio, sample_rate=16000, bandwidth=1000, fshift=300):
422+
@torch.no_grad()
423+
def compute_gne(
424+
audio,
425+
sample_rate=16000,
426+
bandwidth=1000,
427+
fshift=300,
428+
frame_size=300,
429+
hop_size=100,
430+
log_scale=True,
431+
):
415432
"""An algorithm for GNE computation from the original paper:
416433
417434
"Glottal-to-Noise Excitation Ratio - a New Measure for Describing
@@ -438,6 +455,12 @@ def compute_gne(audio, sample_rate=16000, bandwidth=1000, fshift=300):
438455
The width of the frequency bands used for computing correlation.
439456
fshift : float
440457
The shift between frequency bands used for computing correlation.
458+
frame_size : int
459+
Number of samples (at 10k sampling rate) in each analysis frame.
460+
hop_size : int
461+
Number of samples (at 10k sampling rate) between the start of each analysis frame.
462+
log_scale : bool
463+
Whether to represent the output in the log scale.
441464
442465
Returns
443466
-------
@@ -453,8 +476,8 @@ def compute_gne(audio, sample_rate=16000, bandwidth=1000, fshift=300):
453476
old_sample_rate, sample_rate = sample_rate, 10000
454477
audio = torchaudio.functional.resample(audio, old_sample_rate, sample_rate)
455478

456-
# Step 2. Inverse filter with 30-msec window, 10-msec hop and 13th order LPC
457-
frame_size, hop_size, order = 300, 100, 13
479+
# Step 2. Inverse filter with 13th order LPC
480+
order = 13
458481
window = torch.hann_window(frame_size, device=audio.device).view(1, 1, -1)
459482
audio = torch.nn.functional.pad(audio, (0, frame_size))
460483
frames = audio.unfold(dimension=-1, size=frame_size, step=hop_size) * window
@@ -482,4 +505,7 @@ def compute_gne(audio, sample_rate=16000, bandwidth=1000, fshift=300):
482505
gne = torch.stack(correlations, dim=-1).amax(dim=(2, 3))
483506

484507
# Use a log scale for better differentiation
485-
return -10 * torch.log10(torch.clamp(1 - gne, min=EPSILON))
508+
if log_scale:
509+
return -10 * torch.log10(torch.clamp(1 - gne, min=EPSILON))
510+
else:
511+
return gne

0 commit comments

Comments
 (0)