@@ -25,6 +25,7 @@ def vocal_characteristics(
2525 sample_rate : int = 16000 ,
2626 harmonicity_threshold : float = 0.45 ,
2727 jitter_threshold : float = 0.02 ,
28+ log_score : bool = True ,
2829):
2930 """Estimates the vocal characteristics of a signal using auto-correlation, etc.
3031
@@ -50,6 +51,8 @@ def vocal_characteristics(
5051 jitter_threshold: float
5152 One of two threshold values for considering a frame as voiced. Estimated
5253 jitter values greater than this are considered unvoiced.
54+ log_score: bool
55+ Whether to represent the jitter/shimmer/hnr on a log scale
5356
5457 Returns
5558 -------
@@ -100,8 +103,13 @@ def vocal_characteristics(
100103 # By J. Fernandez, F. Teixeira, V. Guedes, A. Junior, and J. P. Teixeira
101104 # Term is dominated by denominator, so just take -1 * log(noise)
102105 # max value for harmonicity is 25 dB, enforced by this minimum here
103- noise = torch .clamp (1 - harmonicity , min = EPSILON )
104- hnr = - 10 * torch .log10 (noise )
106+ if log_score :
107+ noise = torch .clamp (1 - harmonicity , min = EPSILON )
108+ hnr = - 10 * torch .log10 (noise )
109+ jitter = - 10 * torch .log10 (jitter .clamp (min = EPSILON ))
110+ shimmer = - 10 * torch .log10 (shimmer .clamp (min = EPSILON ))
111+ else :
112+ hnr = 1 - harmonicity
105113
106114 return estimated_f0 , voiced , jitter , shimmer , hnr
107115
@@ -411,7 +419,16 @@ def compute_cross_correlation(frames_a, frames_b, width=None):
411419 return cross_correlation
412420
413421
414- def compute_gne (audio , sample_rate = 16000 , bandwidth = 1000 , fshift = 300 ):
422+ @torch .no_grad ()
423+ def compute_gne (
424+ audio ,
425+ sample_rate = 16000 ,
426+ bandwidth = 1000 ,
427+ fshift = 300 ,
428+ frame_size = 300 ,
429+ hop_size = 100 ,
430+ log_scale = True ,
431+ ):
415432 """An algorithm for GNE computation from the original paper:
416433
417434 "Glottal-to-Noise Excitation Ratio - a New Measure for Describing
@@ -438,6 +455,12 @@ def compute_gne(audio, sample_rate=16000, bandwidth=1000, fshift=300):
438455 The width of the frequency bands used for computing correlation.
439456 fshift : float
440457 The shift between frequency bands used for computing correlation.
458+ frame_size : int
459+ Number of samples (at 10k sampling rate) in each analysis frame.
460+ hop_size : int
461+ Number of samples (at 10k sampling rate) between the start of each analysis frame.
462+ log_scale : bool
463+ Whether to represent the output in the log scale.
441464
442465 Returns
443466 -------
@@ -453,8 +476,8 @@ def compute_gne(audio, sample_rate=16000, bandwidth=1000, fshift=300):
453476 old_sample_rate , sample_rate = sample_rate , 10000
454477 audio = torchaudio .functional .resample (audio , old_sample_rate , sample_rate )
455478
456- # Step 2. Inverse filter with 30-msec window, 10-msec hop and 13th order LPC
457- frame_size , hop_size , order = 300 , 100 , 13
479+ # Step 2. Inverse filter with 13th order LPC
480+ order = 13
458481 window = torch .hann_window (frame_size , device = audio .device ).view (1 , 1 , - 1 )
459482 audio = torch .nn .functional .pad (audio , (0 , frame_size ))
460483 frames = audio .unfold (dimension = - 1 , size = frame_size , step = hop_size ) * window
@@ -482,4 +505,7 @@ def compute_gne(audio, sample_rate=16000, bandwidth=1000, fshift=300):
482505 gne = torch .stack (correlations , dim = - 1 ).amax (dim = (2 , 3 ))
483506
484507 # Use a log scale for better differentiation
485- return - 10 * torch .log10 (torch .clamp (1 - gne , min = EPSILON ))
508+ if log_scale :
509+ return - 10 * torch .log10 (torch .clamp (1 - gne , min = EPSILON ))
510+ else :
511+ return gne
0 commit comments