音声処理 | えやみぐさ

PyTorch pydub.AudioSegmentをtorch.Tensorに変換する

import numpy as np import torch import torchaudio import torchaudio.transforms as T ''' in: pydub.AudioSegment out: torch.Tensor (float32) ''' def to_tensor(audio): sample_width = audio.sample_width sample_bits = 8 * sample_width sample_max_int = 2 ** sample_bits sample_channels = audio.channels samples = np.asarray(audio.get_array_of_samples()) samples = samples.reshape((-1, 2)).transpose((1, 0)) # LRLR -> Channel, Samples samples = samples.astype('f') / sample_max_int samples = torch.from_numpy(samples).type(torch.float32) return samples

PortAudio, pyaudio

# PortAudio: PortAudio API Overview
#   Host API
#     ALSA (Ubuntu)
#     Core Audio (Mac)
#   Device
#     Speaker
#     Mic
#   Stream
#     sample rate (num of samples per second)
#     sample format (num of bytes of a sample, integer or float)
# PyAudio Documentation — PyAudio 0.2.11 documentation
# Audio device detection w/ pyaudio
#
#   brew install portaudio
#   pip3 install pyaudio
#
# Installing pyaudio on macOS - Qiita

# --- Check Host APIs ---------------------------------------------------
# Print every host API and, for each one, every device it exposes.
import pyaudio

pa = pyaudio.PyAudio()
try:
    api_count = pa.get_host_api_count()
    print('Host API:', api_count)
    for i in range(api_count):
        api_info = pa.get_host_api_info_by_index(i)
        print(api_info)
        device_count = api_info['deviceCount']
        for j in range(device_count):
            device_info = pa.get_device_info_by_host_api_device_index(i, j)
            print(device_info)
finally:
    # Always release the PortAudio session, even if a query above raises.
    pa.terminate()

# Sample output:
# Host API: 1
# {'index': 0, 'structVersion': 1, 'type': 5, 'name': 'Core Audio', 'deviceCount': 2, 'defaultInputDevice': 0, 'defaultOutputDevice': 1}
# {'index': 0, 'structVersion': 2, 'name': 'Built-in Microphone', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.0029478458049886623, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.01310657596371882, 'defaultHighOutputLatency': 0.1, 'defaultSampleRate': 44100.0}
# {'index': 1, 'structVersion': 2, 'name': 'Built-in Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.007800453514739229, 'defaultHighInputLatency': 0.1, 'defaultHighOutputLatency': 0.017959183673469388, 'defaultSampleRate': 44100.0}

# --- Check Devices -----------------------------------------------------
# Print every device directly, without going through the host APIs.
import pyaudio

pa = pyaudio.PyAudio()
try:
    device_count = pa.get_device_count()
    print('Device:', device_count)
    for i in range(device_count):
        device_info = pa.get_device_info_by_index(i)
        print(device_info)
finally:
    # Always release the PortAudio session, even if a query above raises.
    pa.terminate()

# Sample output:
# Device: 2
# {'index': 0, 'structVersion': 2, 'name': 'Built-in Microphone', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.0029478458049886623, 'defaultLowOutputLatency': 0.01, 'defaultHighInputLatency': 0.01310657596371882, 'defaultHighOutputLatency': 0.1, 'defaultSampleRate': 44100.0}
# {'index': 1, 'structVersion': 2, 'name': 'Built-in Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.01, 'defaultLowOutputLatency': 0.007800453514739229, 'defaultHighInputLatency': 0.1, 'defaultHighOutputLatency': 0.017959183673469388, 'defaultSampleRate': 44100.0}

# --- Stream ------------------------------------------------------------
# Note: if you only need to play a wave file, use the wave module
# instead (see reference). ...

pydub numpy

import time

import numpy as np
from pydub import AudioSegment

# Round-trip an audio file through a NumPy array and time how long pydub
# takes to export the result in several formats.
#
# https://own-search-and-study.xyz/2017/11/19/numpy%E3%81%AEarray%E3%81%8B%E3%82%89pydub%E3%81%AEaudiosegment%E3%82%92%E4%BD%9C%E6%88%90%E3%81%99%E3%82%8B/
# https://maoudamashii.jokersounds.com/archives/song_kouichi_the_milky_way.html
path = 'song_kouichi_the_milky_way.m4a'

# https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentfrom_file
sound = AudioSegment.from_file(path, format='m4a')  # give format explicitly
samples = np.array(sound.get_array_of_samples())

print(path)
print('Sample width (Num of bytes of a sample):', sound.sample_width)
print('Frame rate (Num of samples per second):', sound.frame_rate)
print('Channels (Stereo/Mono):', sound.channels)
print('Shape (Length):', samples.shape)
print('Type:', samples.dtype)
print('Min/Max:', samples.min(), samples.max())

# Rebuild a segment from the raw samples. Keep the samples' own dtype and
# take the rate/channel count from the source: widening int16 values to
# int32 without rescaling would make the audio almost silent, and
# hard-coded 44100 Hz / stereo would be wrong for other input files.
output = AudioSegment(
    samples.tobytes(),
    sample_width=samples.dtype.itemsize,
    frame_rate=sound.frame_rate,
    channels=sound.channels,
)

# (output path, ffmpeg format name, label used in the printed message)
# export() does not infer the format from the file extension; without an
# explicit format= every file would be encoded with the default ('mp3').
# 'ipod' is the ffmpeg muxer name for .m4a files.
export_targets = (
    ('output.m4a', 'ipod', 'm4a'),
    ('output.mp3', 'mp3', 'mp3'),
    ('output.wav', 'wav', 'wav'),
)

for out_path, out_format, label in export_targets:
    ts = time.perf_counter()
    exported = output.export(out_path, format=out_format)
    elapsed = time.perf_counter() - ts
    exported.close()  # export() returns an open file handle
    print('Exported as %s: %f s' % (label, elapsed))

# Sample output (recorded with the original script, which called
# export(path) without format=, so all three files used the default
# 'mp3' format regardless of extension):
# song_kouichi_the_milky_way.m4a
# Sample width (Num of bytes of a sample): 2
# Frame rate (Num of samples per second): 44100
# Channels (Stereo/Mono): 2
# Shape (Length): (22339584,)
# Type: int16
# Min/Max: -32768 32767
# Exported as m4a: 6.288757 s
# Exported as mp3: 6.194534 s
# Exported as wav: 6.064215 s
#
# Original conclusion: export time does not vary with the format (the
# differences are within measurement noise). Given the note above, this
# should be re-measured with explicit formats before being relied on.