# cmu_arctic.py (forked from r9y9/wavenet_vocoder)
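# Preprocessing script for the CMU ARCTIC corpus: it collects the wav files of
# every available speaker, trims silence, converts the waveform to the
# representation selected in hparams, computes mel-spectrograms, and writes
# paired .npy files plus per-utterance metadata for WaveNet vocoder training.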
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
import audio
from nnmnkwii.datasets import cmu_arctic
from nnmnkwii.io import hts
from nnmnkwii import preprocessing as P
from hparams import hparams
from os.path import exists
import librosa
from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw
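
# build_from_path enumerates every utterance of every available CMU ARCTIC
# speaker and dispatches per-utterance feature extraction to a process pool.
# No transcript is extracted here; the literal string "N/A" is stored in the
# text field of each metadata entry.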
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []

    speakers = cmu_arctic.available_speakers
    wd = cmu_arctic.WavFileDataSource(in_dir, speakers=speakers)
    wav_paths = wd.collect_files()
    speaker_ids = wd.labels

    for index, (speaker_id, wav_path) in enumerate(
            zip(speaker_ids, wav_paths)):
        futures.append(executor.submit(
            partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, "N/A")))
    return [future.result() for future in tqdm(futures)]
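
# start_at / end_at locate the first and last non-silent segment ("pau" marks
# pauses) in an HTS-style label sequence; the returned times are in HTK 100 ns
# units, hence the 1e-7 scaling applied in _process_utterance.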
def start_at(labels):
    has_silence = labels[0][-1] == "pau"
    if not has_silence:
        return labels[0][0]
    for i in range(1, len(labels)):
        if labels[i][-1] != "pau":
            return labels[i][0]
    assert False


def end_at(labels):
    has_silence = labels[-1][-1] == "pau"
    if not has_silence:
        return labels[-1][1]
    for i in range(len(labels) - 2, 0, -1):
        if labels[i][-1] != "pau":
            return labels[i][1]
    assert False
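
# _process_utterance loads one wav file, trims silence, converts the waveform
# to the representation selected by hparams.input_type, computes the matching
# mel-spectrogram, saves both as .npy files, and returns a metadata tuple of
# (audio_filename, mel_filename, timesteps, text, speaker_id).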
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
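    # NOTE: the `and False` below intentionally disables the label-based
    # trimming for now, so the librosa energy-based trim (top_db=20) is
    # always applied instead.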
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)
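
    # Peak-normalize the waveform to hparams.rescaling_max when rescaling is
    # enabled.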
    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
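
    # Convert to the representation selected by hparams.input_type:
    #   mulaw-quantize -> integer class indices in [0, quantize_channels)
    #   mulaw          -> continuous mu-law companded signal in [-1, 1]
    #   raw (else)     -> untouched float waveform in [-1, 1]
    # constant_values is the encoding of silence in that representation and is
    # used for padding further below.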

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)
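    # timesteps is the final hop-aligned audio length recorded in the metadata;
    # it equals N * hop_size, where N is the number of mel-spectrogram frames.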

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
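

# Usage sketch (assumption: as in the upstream repo, this module is driven by a
# separate preprocessing script; the paths and driver code below are
# illustrative only, not part of this file's API):
#
#     from multiprocessing import cpu_count
#     from tqdm import tqdm
#     metadata = build_from_path("/path/to/cmu_arctic", "./training_data",
#                                num_workers=cpu_count(), tqdm=tqdm)
#     # each entry: (audio_filename, mel_filename, timesteps, text, speaker_id)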