From aff7f83a1346cd47aaf6673be097403e582311ff Mon Sep 17 00:00:00 2001 From: valrus Date: Sat, 11 Jan 2020 14:50:37 -0800 Subject: [PATCH 01/15] Add an option allowing use without a multiprocessing pool... ...this allows spleeter itself to be run asynchronously from another process using multiprocessing, which otherwise fails with "daemonic processes are not allowed to have children" --- spleeter/separator.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/spleeter/separator.py b/spleeter/separator.py index c1c8f0e..14a7f67 100644 --- a/spleeter/separator.py +++ b/spleeter/separator.py @@ -35,7 +35,7 @@ __license__ = 'MIT License' class Separator(object): """ A wrapper class for performing separation. """ - def __init__(self, params_descriptor, MWF=False): + def __init__(self, params_descriptor, MWF=False, multiprocess=True): """ Default constructor. :param params_descriptor: Descriptor for TF params to be used. @@ -45,7 +45,7 @@ class Separator(object): self._sample_rate = self._params['sample_rate'] self._MWF = MWF self._predictor = None - self._pool = Pool() + self._pool = Pool() if multiprocess else None self._tasks = [] def _get_predictor(self): @@ -133,12 +133,15 @@ class Separator(object): f'Separated source path conflict : {path},' 'please check your filename format')) generated.append(path) - task = self._pool.apply_async(audio_adapter.save, ( - path, - data, - self._sample_rate, - codec, - bitrate)) - self._tasks.append(task) - if synchronous: + if self._pool: + task = self._pool.apply_async(audio_adapter.save, ( + path, + data, + self._sample_rate, + codec, + bitrate)) + self._tasks.append(task) + else: + audio_adapter.save(path, data, self._sample_rate, codec, bitrate) + if synchronous and self._pool: self.join() From aa0cc14a8cc3002e4fbb8e61058cddee03cc5b57 Mon Sep 17 00:00:00 2001 From: Rob Moore Date: Tue, 21 Jan 2020 15:03:44 +0000 Subject: [PATCH 02/15] fix(config.yml): docker run 4 and 5 stem containers The `docker run` job for embedded 4 and 5 stem images was, by default, using the 2stems model. The 2stem model is downloaded each time. This change uses the -p option to use the 4 and 5 stem models, respectively. --- .circleci/config.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2fc1e5d..a1d8826 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -103,8 +103,8 @@ jobs: - run: docker build --build-arg BASETAG=conda --build-arg MODEL=5stems -t researchdeezer/spleeter:conda-5stems -f docker/embedded-model.dockerfile . - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:conda separate -i /runtime/audio_example.mp3 -o /tmp - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:conda-2stems separate -i /runtime/audio_example.mp3 -o /tmp - - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:conda-4stems separate -i /runtime/audio_example.mp3 -o /tmp - - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:conda-5stems separate -i /runtime/audio_example.mp3 -o /tmp + - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:conda-4stems separate -i /runtime/audio_example.mp3 -p spleeter:4stems -o /tmp + - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:conda-5stems separate -i /runtime/audio_example.mp3 -p spleeter:5stems -o /tmp - run: docker login -u $DOCKERHUB_USERNAME -p $DOCKERHUB_PASSWORD - run: docker push researchdeezer/spleeter:conda - run: docker push researchdeezer/spleeter:conda-2stems @@ -139,8 +139,8 @@ jobs: - run: docker build --build-arg BASETAG=3.6 --build-arg MODEL=5stems -t researchdeezer/spleeter:3.6-5stems -f docker/embedded-model.dockerfile . - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.6 separate -i /runtime/audio_example.mp3 -o /tmp - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.6-2stems separate -i /runtime/audio_example.mp3 -o /tmp - - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.6-4stems separate -i /runtime/audio_example.mp3 -o /tmp - - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.6-5stems separate -i /runtime/audio_example.mp3 -o /tmp + - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.6-4stems separate -i /runtime/audio_example.mp3 -p spleeter:4stems -o /tmp + - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.6-5stems separate -i /runtime/audio_example.mp3 -p spleeter:5stems -o /tmp - run: docker login -u $DOCKERHUB_USERNAME -p $DOCKERHUB_PASSWORD - run: docker push researchdeezer/spleeter:3.6 - run: docker push researchdeezer/spleeter:3.6-2stems @@ -173,8 +173,8 @@ jobs: - run: docker build --build-arg BASETAG=3.7 --build-arg MODEL=5stems -t researchdeezer/spleeter:3.7-5stems -f docker/embedded-model.dockerfile . - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.7 separate -i /runtime/audio_example.mp3 -o /tmp - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.7-2stems separate -i /runtime/audio_example.mp3 -o /tmp - - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.7-4stems separate -i /runtime/audio_example.mp3 -o /tmp - - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.7-5stems separate -i /runtime/audio_example.mp3 -o /tmp + - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.7-4stems separate -i /runtime/audio_example.mp3 -p spleeter:4stems -o /tmp + - run: docker run -v $(pwd):/runtime researchdeezer/spleeter:3.7-5stems separate -i /runtime/audio_example.mp3 -p spleeter:5stems -o /tmp - run: docker login -u $DOCKERHUB_USERNAME -p $DOCKERHUB_PASSWORD - run: docker tag researchdeezer/spleeter:3.7 researchdeezer/spleeter:latest - run: docker push researchdeezer/spleeter:latest @@ -275,4 +275,4 @@ workflows: filters: branches: only: - - master \ No newline at end of file + - master From d2604235ac0008f389469551528aec74c460de6e Mon Sep 17 00:00:00 2001 From: Yusuke Goto Date: Wed, 1 Jan 2020 16:51:22 +0900 Subject: [PATCH 03/15] fix: switch up to 1.4.9 --- docker/conda-gpu.dockerfile | 2 +- docker/conda.dockerfile | 2 +- docker/python-3.6-gpu.dockerfile | 2 +- docker/python-3.6.dockerfile | 2 +- docker/python-3.7-gpu.dockerfile | 2 +- docker/python-3.7.dockerfile | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/conda-gpu.dockerfile b/docker/conda-gpu.dockerfile index 10d7631..2fb92d7 100644 --- a/docker/conda-gpu.dockerfile +++ b/docker/conda-gpu.dockerfile @@ -17,7 +17,7 @@ COPY audio_example.mp3 . RUN conda install -y -c conda-forge musdb # RUN conda install -y -c conda-forge museval -RUN conda install -y -c conda-forge spleeter-gpu=1.4.5 +RUN conda install -y -c conda-forge spleeter-gpu=1.4.9 ENTRYPOINT ["spleeter"] \ No newline at end of file diff --git a/docker/conda.dockerfile b/docker/conda.dockerfile index c40d0b7..1ecf193 100644 --- a/docker/conda.dockerfile +++ b/docker/conda.dockerfile @@ -6,6 +6,6 @@ COPY audio_example.mp3 . RUN conda install -y -c conda-forge musdb # RUN conda install -y -c conda-forge museval -RUN conda install -y -c conda-forge spleeter=1.4.5 +RUN conda install -y -c conda-forge spleeter=1.4.9 ENTRYPOINT ["spleeter"] \ No newline at end of file diff --git a/docker/python-3.6-gpu.dockerfile b/docker/python-3.6-gpu.dockerfile index 3fe010b..4fcd596 100644 --- a/docker/python-3.6-gpu.dockerfile +++ b/docker/python-3.6-gpu.dockerfile @@ -52,6 +52,6 @@ COPY audio_example.mp3 . # Spleeter installation. RUN apt-get update && apt-get install -y ffmpeg libsndfile1 RUN pip install musdb museval -RUN pip install spleeter-gpu==1.4.5 +RUN pip install spleeter-gpu==1.4.9 ENTRYPOINT ["spleeter"] \ No newline at end of file diff --git a/docker/python-3.6.dockerfile b/docker/python-3.6.dockerfile index 7654ff1..0cad379 100644 --- a/docker/python-3.6.dockerfile +++ b/docker/python-3.6.dockerfile @@ -6,6 +6,6 @@ COPY audio_example.mp3 . RUN apt-get update && apt-get install -y ffmpeg libsndfile1 RUN pip install musdb museval -RUN pip install spleeter==1.4.5 +RUN pip install spleeter==1.4.9 ENTRYPOINT ["spleeter"] \ No newline at end of file diff --git a/docker/python-3.7-gpu.dockerfile b/docker/python-3.7-gpu.dockerfile index b94b84c..e4e4de6 100644 --- a/docker/python-3.7-gpu.dockerfile +++ b/docker/python-3.7-gpu.dockerfile @@ -52,6 +52,6 @@ COPY audio_example.mp3 . # Spleeter installation. RUN apt-get update && apt-get install -y ffmpeg libsndfile1 RUN pip install musdb museval -RUN pip install spleeter-gpu==1.4.5 +RUN pip install spleeter-gpu==1.4.9 ENTRYPOINT ["spleeter"] \ No newline at end of file diff --git a/docker/python-3.7.dockerfile b/docker/python-3.7.dockerfile index 223d0ed..efa9c1f 100644 --- a/docker/python-3.7.dockerfile +++ b/docker/python-3.7.dockerfile @@ -6,6 +6,6 @@ COPY audio_example.mp3 . RUN apt-get update && apt-get install -y ffmpeg libsndfile1 RUN pip install musdb museval -RUN pip install spleeter==1.4.5 +RUN pip install spleeter==1.4.9 ENTRYPOINT ["spleeter"] \ No newline at end of file From a23a41a3800ce43b1c9edbe9ccca1cf3578a10d6 Mon Sep 17 00:00:00 2001 From: mmoussallam Date: Mon, 27 Jan 2020 15:10:34 +0100 Subject: [PATCH 04/15] fixing possible race condition in output writing --- spleeter/audio/ffmpeg.py | 8 ++++---- spleeter/separator.py | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/spleeter/audio/ffmpeg.py b/spleeter/audio/ffmpeg.py index 168ec36..08218e9 100644 --- a/spleeter/audio/ffmpeg.py +++ b/spleeter/audio/ffmpeg.py @@ -112,9 +112,9 @@ class FFMPEGProcessAudioAdapter(AudioAdapter): :param bitrate: (Optional) Bitrate of the written audio file. :raise IOError: If any error occurs while using FFMPEG to write data. """ - directory = os.path.split(path)[0] + directory = os.path.dirname(path) if not os.path.exists(directory): - os.makedirs(directory) + raise SpleeterError(f'output directory does not exists: {directory}') get_logger().debug('Writing file %s', path) input_kwargs = {'ar': sample_rate, 'ac': data.shape[1]} output_kwargs = {'ar': sample_rate, 'strict': '-2'} @@ -127,11 +127,11 @@ class FFMPEGProcessAudioAdapter(AudioAdapter): .input('pipe:', format='f32le', **input_kwargs) .output(path, **output_kwargs) .overwrite_output() - .run_async(pipe_stdin=True, quiet=True)) + .run_async(pipe_stdin=True, pipe_stderr=True, quiet=True)) try: process.stdin.write(data.astype(' Date: Wed, 19 Feb 2020 10:55:48 +0100 Subject: [PATCH 05/15] Adding a new argument to support chunked inference --- spleeter/commands/__init__.py | 13 +++++++++++++ spleeter/separator.py | 30 +++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/spleeter/commands/__init__.py b/spleeter/commands/__init__.py index 2bbc974..9461a07 100644 --- a/spleeter/commands/__init__.py +++ b/spleeter/commands/__init__.py @@ -68,6 +68,18 @@ OPT_DURATION = { 'the input file)') } +# -w opt specification (separate) +OPT_CHUNKED = { + 'dest': 'chunk_duration', + 'type': float, + 'default': -1, + 'help': 'Maximum duration of the segments that are fed to' + ' the network. Use this parameter to limit ' + 'memory usage. Use -1 to process the whole signal' + ' in one pass.' +} + + # -c opt specification (separate). OPT_CODEC = { 'dest': 'codec', @@ -176,6 +188,7 @@ def _create_separate_parser(parser_factory): parser.add_argument('-c', '--codec', **OPT_CODEC) parser.add_argument('-b', '--birate', **OPT_BITRATE) parser.add_argument('-m', '--mwf', **OPT_MWF) + parser.add_argument('-w', '--chunk', **OPT_CHUNKED) return parser diff --git a/spleeter/separator.py b/spleeter/separator.py index 3ef3ecf..4423888 100644 --- a/spleeter/separator.py +++ b/spleeter/separator.py @@ -13,17 +13,14 @@ """ import os -import json -from functools import partial from multiprocessing import Pool -from pathlib import Path from os.path import basename, join, splitext +import numpy as np from . import SpleeterError from .audio.adapter import get_default_audio_adapter from .audio.convertor import to_stereo -from .model import model_fn from .utils.configuration import load_configuration from .utils.estimator import create_estimator, to_predictor @@ -90,9 +87,21 @@ class Separator(object): prediction.pop('audio_id') return prediction + def separate_chunked(self, waveform, sample_rate, chunk_duration=-1): + chunk_size = waveform.shape[0] if chunk_duration == -1 else chunk_duration*sample_rate + n_chunks = int(waveform.shape[0]/chunk_size) + out = {} + for i in range(n_chunks): + sources = self.separate(waveform) + for inst, data in sources.items(): + out.setdefault(inst, []).append(data) + for inst, data in out.items(): + out[inst] = np.concatenate(data, axis=0) + return out + def separate_to_file( self, audio_descriptor, destination, - audio_adapter=get_default_audio_adapter(), + audio_adapter=get_default_audio_adapter(), chunk_duration=-1, offset=0, duration=600., codec='wav', bitrate='128k', filename_format='{filename}/{instrument}.{codec}', synchronous=True): @@ -108,6 +117,8 @@ class Separator(object): descriptor would be a file path. :param destination: Target directory to write output to. :param audio_adapter: (Optional) Audio adapter to use for I/O. + :param chunk_duration: (Optional) Maximum signal duration that is processed + in one pass. Default: all signal. :param offset: (Optional) Offset of loaded song. :param duration: (Optional) Duration of loaded song. :param codec: (Optional) Export codec. @@ -115,12 +126,17 @@ class Separator(object): :param filename_format: (Optional) Filename format. :param synchronous: (Optional) True is should by synchronous. """ - waveform, _ = audio_adapter.load( + waveform, sample_rate = audio_adapter.load( audio_descriptor, offset=offset, duration=duration, sample_rate=self._sample_rate) - sources = self.separate(waveform) + sources = self.separate_chunked(waveform, sample_rate, chunk_duration=chunk_duration) + self.save_to_file(sources, audio_descriptor, destination, filename_format, codec, + audio_adapter, bitrate, synchronous) + + def save_to_file(self, sources, audio_descriptor, destination, filename_format, codec, + audio_adapter, bitrate, synchronous): filename = splitext(basename(audio_descriptor))[0] generated = [] for instrument, data in sources.items(): From aa7c208b3903192ce92584c42e99263eda764302 Mon Sep 17 00:00:00 2001 From: akhlif Date: Wed, 19 Feb 2020 23:11:16 +0100 Subject: [PATCH 06/15] First draft implementing correct reconstruction --- spleeter/commands/__init__.py | 5 +- spleeter/commands/separate.py | 1 + spleeter/model/__init__.py | 56 +++++++++++++------- spleeter/separator.py | 97 +++++++++++++++++++++++++++++++---- spleeter/utils/estimator.py | 31 ++++++++--- 5 files changed, 152 insertions(+), 38 deletions(-) diff --git a/spleeter/commands/__init__.py b/spleeter/commands/__init__.py index 9461a07..21b70ef 100644 --- a/spleeter/commands/__init__.py +++ b/spleeter/commands/__init__.py @@ -4,7 +4,7 @@ """ This modules provides spleeter command as well as CLI parsing methods. """ import json - +import logging from argparse import ArgumentParser from tempfile import gettempdir from os.path import exists, join @@ -13,6 +13,9 @@ __email__ = 'research@deezer.com' __author__ = 'Deezer Research' __license__ = 'MIT License' +logging.basicConfig() + + # -i opt specification (separate). OPT_INPUT = { 'dest': 'inputs', diff --git a/spleeter/commands/separate.py b/spleeter/commands/separate.py index b1f41ab..b3de4b6 100644 --- a/spleeter/commands/separate.py +++ b/spleeter/commands/separate.py @@ -35,6 +35,7 @@ def entrypoint(arguments, params): filename, arguments.output_path, audio_adapter=audio_adapter, + chunk_duration=arguments.chunk_duration, offset=arguments.offset, duration=arguments.duration, codec=arguments.codec, diff --git a/spleeter/model/__init__.py b/spleeter/model/__init__.py index 384e838..4d18eb5 100644 --- a/spleeter/model/__init__.py +++ b/spleeter/model/__init__.py @@ -106,7 +106,7 @@ class EstimatorSpecBuilder(object): self._frame_length = params['frame_length'] self._frame_step = params['frame_step'] - def _build_output_dict(self): + def _build_output_dict(self, input_tensor=None): """ Created a batch_sizexTxFxn_channels input tensor containing mix magnitude spectrogram, then an output dict from it according to the selected model in internal parameters. @@ -114,7 +114,8 @@ class EstimatorSpecBuilder(object): :returns: Build output dict. :raise ValueError: If required model_type is not supported. """ - input_tensor = self._features[f'{self._mix_name}_spectrogram'] + if input_tensor is None: + input_tensor = self._features[f'{self._mix_name}_spectrogram'] model = self._params.get('model', None) if model is not None: model_type = model.get('type', self.DEFAULT_MODEL) @@ -194,6 +195,12 @@ class EstimatorSpecBuilder(object): self._features[f'{self._mix_name}_spectrogram'] = tf.abs( pad_and_partition(stft_feature, self._T))[:, :, :self._F, :] + def get_stft_feature(self): + return self._features[f'{self._mix_name}_stft'] + + def get_spectrogram_feature(self): + return self._features[f'{self._mix_name}_spectrogram'] + def _inverse_stft(self, stft): """ Inverse and reshape the given STFT @@ -269,26 +276,18 @@ class EstimatorSpecBuilder(object): extension = tf.tile(extension_row, [1, 1, n_extra_row, 1]) return tf.concat([mask, extension], axis=2) - def _build_manual_output_waveform(self, output_dict): - """ Perform ratio mask separation - - :param output_dict: dictionary of estimated spectrogram (key: instrument - name, value: estimated spectrogram of the instrument) - :returns: dictionary of separated waveforms (key: instrument name, - value: estimated waveform of the instrument) - """ + def _build_masks(self, output_dict): separation_exponent = self._params['separation_exponent'] output_sum = tf.reduce_sum( [e ** separation_exponent for e in output_dict.values()], axis=0 ) + self.EPSILON - output_waveform = {} + out = {} for instrument in self._instruments: output = output_dict[f'{instrument}_spectrogram'] # Compute mask with the model. - instrument_mask = ( - output ** separation_exponent - + (self.EPSILON / len(output_dict))) / output_sum + instrument_mask = (output ** separation_exponent + + (self.EPSILON / len(output_dict))) / output_sum # Extend mask; instrument_mask = self._extend_mask(instrument_mask) # Stack back mask. @@ -300,10 +299,31 @@ class EstimatorSpecBuilder(object): # Remove padded part (for mask having the same size as STFT); stft_feature = self._features[f'{self._mix_name}_stft'] instrument_mask = instrument_mask[ - :tf.shape(stft_feature)[0], ...] - # Compute masked STFT and normalize it. - output_waveform[instrument] = self._inverse_stft( - tf.cast(instrument_mask, dtype=tf.complex64) * stft_feature) + :tf.shape(stft_feature)[0], ...] + out[instrument] = instrument_mask + return out + + def _build_masked_stft(self, mask_dict, input_stft=None): + if input_stft is None: + input_stft = self._features[f'{self._mix_name}_stft'] + out = {} + for instrument, mask in mask_dict.items(): + out[instrument] = tf.cast(mask, dtype=tf.complex64) * input_stft + return out + + def _build_manual_output_waveform(self, output_dict): + """ Perform ratio mask separation + + :param output_dict: dictionary of estimated spectrogram (key: instrument + name, value: estimated spectrogram of the instrument) + :returns: dictionary of separated waveforms (key: instrument name, + value: estimated waveform of the instrument) + """ + + output_waveform = {} + masked_stft = self._build_masked_stft(self._build_masks(output_dict)) + for instrument, stft_data in masked_stft.items(): + output_waveform[instrument] = self._inverse_stft(stft_data) return output_waveform def _build_output_waveform(self, output_dict): diff --git a/spleeter/separator.py b/spleeter/separator.py index 4423888..f7eac1b 100644 --- a/spleeter/separator.py +++ b/spleeter/separator.py @@ -13,22 +13,31 @@ """ import os +import logging +from time import time from multiprocessing import Pool from os.path import basename, join, splitext import numpy as np +import tensorflow as tf + from . import SpleeterError from .audio.adapter import get_default_audio_adapter from .audio.convertor import to_stereo from .utils.configuration import load_configuration -from .utils.estimator import create_estimator, to_predictor +from .utils.estimator import create_estimator, to_predictor, get_input_dict_placeholders, get_default_model_dir +from .model import EstimatorSpecBuilder + __email__ = 'research@deezer.com' __author__ = 'Deezer Research' __license__ = 'MIT License' +logger = logging.getLogger("spleeter") + + class Separator(object): """ A wrapper class for performing separation. """ @@ -87,16 +96,79 @@ class Separator(object): prediction.pop('audio_id') return prediction - def separate_chunked(self, waveform, sample_rate, chunk_duration=-1): - chunk_size = waveform.shape[0] if chunk_duration == -1 else chunk_duration*sample_rate - n_chunks = int(waveform.shape[0]/chunk_size) + def get_valid_chunk_size(self, sample_rate: int, chunk_max_duration: float) -> int: + """ + Given a sample rate, and a maximal duration that a chunk can represent, return the maximum chunk + size in samples. The chunk size must be a non-zero multiple of T (temporal dimension of the input spectrogram) + times F (number of frequency bins in the input spectrogram). If no such value exist, we return T*F. + + :param sample_rate: sample rate of the pcm data + :param chunk_max_duration: maximal duration in seconds of a chunk + :return: highest non-zero chunk size of duration less than chunk_max_duration or minimal valid chunk size. + """ + assert chunk_max_duration > 0 + chunk_size = chunk_max_duration * sample_rate + min_sample_size = self._params["T"] * self._params["F"] + if chunk_size < min_sample_size: + min_duration = min_sample_size / sample_rate + logger.warning("chunk_duration must be at least {:.2f} seconds. Ignoring parameter".format(min_duration)) + chunk_size = min_sample_size + return min_sample_size*int(chunk_size/min_sample_size) + + def get_batch_size_for_chunk_size(self, chunk_size): + d = self._params["T"] * self._params["F"] + assert chunk_size % d == 0 + return chunk_size//d + + def separate_chunked(self, waveform, sample_rate, chunk_max_duration): + chunk_size = self.get_valid_chunk_size(sample_rate, chunk_max_duration) + print(f"chunk size is {chunk_size}") + batch_size = self.get_batch_size_for_chunk_size(chunk_size) + print(f"batch size {batch_size}") + T, F = self._params["T"], self._params["F"] out = {} - for i in range(n_chunks): - sources = self.separate(waveform) - for inst, data in sources.items(): - out.setdefault(inst, []).append(data) - for inst, data in out.items(): - out[inst] = np.concatenate(data, axis=0) + n_batches = (waveform.shape[0]+batch_size*T*F-1)//(batch_size*T*F) + print(f"{n_batches} to compute") + features = get_input_dict_placeholders(self._params) + spectrogram_input_t = tf.placeholder(tf.float32, shape=(None, T, F, 2), name="spectrogram_input") + istft_input_t = tf.placeholder(tf.complex64, shape=(None, F, 2), name="istft_input") + start_t = tf.placeholder(tf.int32, shape=(), name="start") + end_t = tf.placeholder(tf.int32, shape=(), name="end") + builder = EstimatorSpecBuilder(features, self._params) + latest_checkpoint = tf.train.latest_checkpoint(get_default_model_dir(self._params['model_dir'])) + + # TODO: fix the logic, build sometimes return, sometimes set attribute + builder._build_stft_feature() + stft_t = builder.get_stft_feature() + output_dict_t = builder._build_output_dict(input_tensor=spectrogram_input_t) + masked_stft_t = builder._build_masked_stft(builder._build_masks(output_dict_t), + input_stft=stft_t[start_t:end_t, :, :]) + output_waveform_t = builder._inverse_stft(istft_input_t) + waveform_t = features["waveform"] + masked_stfts = {} + saver = tf.train.Saver() + + with tf.Session() as sess: + print("restoring weights {}".format(time())) + saver.restore(sess, latest_checkpoint) + print("computing spectrogram {}".format(time())) + spectrogram, stft = sess.run([builder.get_spectrogram_feature(), stft_t], feed_dict={waveform_t: waveform}) + print(spectrogram.shape) + print(stft.shape) + for i in range(n_batches): + print("computing batch {} {}".format(i, time())) + start = i*batch_size + end = (i+1)*batch_size + tmp = sess.run(masked_stft_t, + feed_dict={spectrogram_input_t: spectrogram[start:end, ...], + start_t: start*T, end_t: end*T, stft_t: stft}) + for instrument, masked_stft in tmp.items(): + masked_stfts.setdefault(instrument, []).append(masked_stft) + + print("inverting spectrogram {}".format(time())) + for instrument, masked_stft in masked_stfts.items(): + out[instrument] = sess.run(output_waveform_t, {istft_input_t: np.concatenate(masked_stft, axis=0)}) + print("done separating {}".format(time())) return out def separate_to_file( @@ -126,12 +198,15 @@ class Separator(object): :param filename_format: (Optional) Filename format. :param synchronous: (Optional) True is should by synchronous. """ + print("loading audio {}".format(time())) waveform, sample_rate = audio_adapter.load( audio_descriptor, offset=offset, duration=duration, sample_rate=self._sample_rate) - sources = self.separate_chunked(waveform, sample_rate, chunk_duration=chunk_duration) + print("done loading audio {}".format(time())) + sources = self.separate_chunked(waveform, sample_rate, chunk_duration) + print("saving to file {}".format(time())) self.save_to_file(sources, audio_descriptor, destination, filename_format, codec, audio_adapter, bitrate, synchronous) diff --git a/spleeter/utils/estimator.py b/spleeter/utils/estimator.py index 95c4219..2b17e96 100644 --- a/spleeter/utils/estimator.py +++ b/spleeter/utils/estimator.py @@ -20,20 +20,30 @@ from ..model.provider import get_default_model_provider DEFAULT_EXPORT_DIRECTORY = join(gettempdir(), 'serving') + +def get_default_model_dir(model_dir): + """ + Transforms a string like 'spleeter:2stems' into an actual path. + :param model_dir: + :return: + """ + model_provider = get_default_model_provider() + return model_provider.get(model_dir) + def create_estimator(params, MWF): """ Initialize tensorflow estimator that will perform separation Params: - - params: a dictionnary of parameters for building the model + - params: a dictionary of parameters for building the model Returns: a tensorflow estimator """ # Load model. - model_directory = params['model_dir'] - model_provider = get_default_model_provider() - params['model_dir'] = model_provider.get(model_directory) + + + params['model_dir'] = get_default_model_dir(params['model_dir']) params['MWF'] = MWF # Setup config session_config = tf.compat.v1.ConfigProto() @@ -49,6 +59,14 @@ def create_estimator(params, MWF): return estimator +def get_input_dict_placeholders(params): + shape = (None, params['n_channels']) + features = { + 'waveform': tf.compat.v1.placeholder(tf.float32, shape=shape, name="waveform"), + 'audio_id': tf.compat.v1.placeholder(tf.string, name="audio_id")} + return features + + def to_predictor(estimator, directory=DEFAULT_EXPORT_DIRECTORY): """ Exports given estimator as predictor into the given directory and returns associated tf.predictor instance. @@ -57,10 +75,7 @@ def to_predictor(estimator, directory=DEFAULT_EXPORT_DIRECTORY): :param directory: (Optional) path to write exported model into. """ def receiver(): - shape = (None, estimator.params['n_channels']) - features = { - 'waveform': tf.compat.v1.placeholder(tf.float32, shape=shape), - 'audio_id': tf.compat.v1.placeholder(tf.string)} + features = get_input_dict_placeholders(estimator.params) return tf.estimator.export.ServingInputReceiver(features, features) estimator.export_saved_model(directory, receiver) From fe4634afa68623034423ce256d886c32d0e27d45 Mon Sep 17 00:00:00 2001 From: akhlif Date: Wed, 26 Feb 2020 16:31:24 +0100 Subject: [PATCH 07/15] Adding option to use librosa backend. Changes in the EstimatorBuilder to set attributes instead of returning tensors for the _build methods. InputProvider classes to handle the different backend cases. New method in Separator. --- spleeter/commands/__init__.py | 17 ++- spleeter/commands/separate.py | 12 +- spleeter/model/__init__.py | 230 ++++++++++++++++++++++++++-------- spleeter/separator.py | 83 +++++------- spleeter/utils/estimator.py | 14 +-- 5 files changed, 232 insertions(+), 124 deletions(-) diff --git a/spleeter/commands/__init__.py b/spleeter/commands/__init__.py index 21b70ef..452b3a6 100644 --- a/spleeter/commands/__init__.py +++ b/spleeter/commands/__init__.py @@ -72,14 +72,13 @@ OPT_DURATION = { } # -w opt specification (separate) -OPT_CHUNKED = { - 'dest': 'chunk_duration', - 'type': float, - 'default': -1, - 'help': 'Maximum duration of the segments that are fed to' - ' the network. Use this parameter to limit ' - 'memory usage. Use -1 to process the whole signal' - ' in one pass.' +OPT_STFT_BACKEND = { + 'dest': 'stft_backend', + 'type': str, + 'choices' : ["tensorflow", "librosa", "auto"], + 'default': "auto", + 'help': 'Who should be in charge of computing the stfts. Librosa is faster than tensorflow on CPU and uses' + ' less memory. "auto" will use tensorflow when GPU acceleration is available and librosa when not.' } @@ -191,7 +190,7 @@ def _create_separate_parser(parser_factory): parser.add_argument('-c', '--codec', **OPT_CODEC) parser.add_argument('-b', '--birate', **OPT_BITRATE) parser.add_argument('-m', '--mwf', **OPT_MWF) - parser.add_argument('-w', '--chunk', **OPT_CHUNKED) + parser.add_argument('-B', '--stft-backend', **OPT_STFT_BACKEND) return parser diff --git a/spleeter/commands/separate.py b/spleeter/commands/separate.py index b3de4b6..2fac8aa 100644 --- a/spleeter/commands/separate.py +++ b/spleeter/commands/separate.py @@ -11,6 +11,8 @@ -i /path/to/audio1.wav /path/to/audio2.mp3 """ +import tensorflow as tf + from ..audio.adapter import get_audio_adapter from ..separator import Separator @@ -19,6 +21,12 @@ __author__ = 'Deezer Research' __license__ = 'MIT License' +def get_backend(backend): + if backend == "auto": + return "tensorflow" if tf.test.is_gpu_available() else "librosa" + return backend + + def entrypoint(arguments, params): """ Command entrypoint. @@ -29,13 +37,13 @@ def entrypoint(arguments, params): audio_adapter = get_audio_adapter(arguments.audio_adapter) separator = Separator( arguments.configuration, - arguments.MWF) + MWF=arguments.MWF, + stft_backend=get_backend(arguments.stft_backend)) for filename in arguments.inputs: separator.separate_to_file( filename, arguments.output_path, audio_adapter=audio_adapter, - chunk_duration=arguments.chunk_duration, offset=arguments.offset, duration=arguments.duration, codec=arguments.codec, diff --git a/spleeter/model/__init__.py b/spleeter/model/__init__.py index 4d18eb5..4c3e200 100644 --- a/spleeter/model/__init__.py +++ b/spleeter/model/__init__.py @@ -18,6 +18,10 @@ __author__ = 'Deezer Research' __license__ = 'MIT License' + +placeholder = tf.compat.v1.placeholder + + def get_model_function(model_type): """ Get tensorflow function of the model to be applied to the input tensor. @@ -41,6 +45,74 @@ def get_model_function(model_type): return model_function +class InputProvider(object): + + def __init__(self, params): + self.params = params + + def get_input_dict_placeholders(self): + raise NotImplementedError() + + @property + def input_names(self): + raise NotImplementedError() + + def get_feed_dict(self, features, *args): + raise NotImplementedError() + + +class WaveformInputProvider(InputProvider): + + @property + def input_names(self): + return ["audio_id", "waveform"] + + def get_input_dict_placeholders(self): + shape = (None, self.params['n_channels']) + features = { + 'waveform': placeholder(tf.float32, shape=shape, name="waveform"), + 'audio_id': placeholder(tf.string, name="audio_id")} + return features + + def get_feed_dict(self, features, waveform, audio_id): + return {features["audio_id"]: audio_id, features["waveform"]: waveform} + + +class SpectralInputProvider(InputProvider): + + def __init__(self, params): + super().__init__(params) + self.stft_input_name = "{}_stft".format(self.params["mix_name"]) + + @property + def input_names(self): + return ["audio_id", self.stft_input_name] + + def get_input_dict_placeholders(self): + features = { + self.stft_input_name: placeholder(tf.complex64, + shape=(None, self.params["frame_length"]//2+1, + self.params['n_channels']), + name=self.stft_input_name), + 'audio_id': placeholder(tf.string, name="audio_id")} + return features + + def get_feed_dict(self, features, stft, audio_id): + return {features["audio_id"]: audio_id, features[self.stft_input_name]: stft} + + +class InputProviderFactory(object): + + @staticmethod + def get(params): + stft_backend = params["stft_backend"] + assert stft_backend in ("tensorflow", "librosa") + if stft_backend == "tensorflow": + return WaveformInputProvider(params) + else: + return SpectralInputProvider(params) + + class EstimatorSpecBuilder(object): """ A builder class that allows to builds a multitrack unet model estimator. The built model estimator has a different behaviour when @@ -57,9 +129,9 @@ class EstimatorSpecBuilder(object): >>> from spleeter.model import EstimatorSpecBuilder >>> builder = EstimatorSpecBuilder() - >>> builder.build_prediction_model() + >>> builder.build_predict_model() >>> builder.build_evaluation_model() - >>> builder.build_training_model() + >>> builder.build_train_model() >>> from spleeter.model import model_fn >>> estimator = tf.estimator.Estimator(model_fn=model_fn, ...) @@ -94,6 +166,7 @@ class EstimatorSpecBuilder(object): :param features: The input features for the estimator. :param params: Some hyperparameters as a dictionary. """ + self._features = features self._params = params # Get instrument name. @@ -106,7 +179,10 @@ class EstimatorSpecBuilder(object): self._frame_length = params['frame_length'] self._frame_step = params['frame_step'] - def _build_output_dict(self, input_tensor=None): + def include_stft_computations(self): + return self._params["stft_backend"] == "tensorflow" + + def _build_model_outputs(self): """ Created a batch_sizexTxFxn_channels input tensor containing mix magnitude spectrogram, then an output dict from it according to the selected model in internal parameters. @@ -114,8 +190,8 @@ class EstimatorSpecBuilder(object): :returns: Build output dict. :raise ValueError: If required model_type is not supported. """ - if input_tensor is None: - input_tensor = self._features[f'{self._mix_name}_spectrogram'] + + input_tensor = self.spectrogram_feature model = self._params.get('model', None) if model is not None: model_type = model.get('type', self.DEFAULT_MODEL) @@ -125,12 +201,12 @@ class EstimatorSpecBuilder(object): apply_model = get_model_function(model_type) except ModuleNotFoundError: raise ValueError(f'No model function {model_type} found') - return apply_model( + self._model_outputs = apply_model( input_tensor, self._instruments, self._params['model']['params']) - def _build_loss(self, output_dict, labels): + def _build_loss(self, labels): """ Construct tensorflow loss and metrics :param output_dict: dictionary of network outputs (key: instrument @@ -139,6 +215,7 @@ class EstimatorSpecBuilder(object): name, value: ground truth spectrogram of the instrument) :returns: tensorflow (loss, metrics) tuple. """ + output_dict = self.model_outputs loss_type = self._params.get('loss_type', self.L1_MASK) if loss_type == self.L1_MASK: losses = { @@ -178,30 +255,72 @@ class EstimatorSpecBuilder(object): return tf.compat.v1.train.GradientDescentOptimizer(rate) return tf.compat.v1.train.AdamOptimizer(rate) + @property + def instruments(self): + return self._instruments + + @property + def stft_name(self): + return f'{self._mix_name}_stft' + + @property + def spectrogram_name(self): + return f'{self._mix_name}_spectrogram' + def _build_stft_feature(self): """ Compute STFT of waveform and slice the STFT in segment with the right length to feed the network. """ - stft_feature = tf.transpose( - stft( - tf.transpose(self._features['waveform']), - self._frame_length, - self._frame_step, - window_fn=lambda frame_length, dtype: ( - hann_window(frame_length, periodic=True, dtype=dtype)), - pad_end=True), - perm=[1, 2, 0]) - self._features[f'{self._mix_name}_stft'] = stft_feature - self._features[f'{self._mix_name}_spectrogram'] = tf.abs( - pad_and_partition(stft_feature, self._T))[:, :, :self._F, :] - def get_stft_feature(self): - return self._features[f'{self._mix_name}_stft'] + stft_name = self.stft_name + spec_name = self.spectrogram_name - def get_spectrogram_feature(self): - return self._features[f'{self._mix_name}_spectrogram'] + if stft_name not in self._features: + stft_feature = tf.transpose( + stft( + tf.transpose(self._features['waveform']), + self._frame_length, + self._frame_step, + window_fn=lambda frame_length, dtype: ( + hann_window(frame_length, periodic=True, dtype=dtype)), + pad_end=True), + perm=[1, 2, 0]) + self._features[f'{self._mix_name}_stft'] = stft_feature + if spec_name not in self._features: + self._features[spec_name] = tf.abs( + pad_and_partition(self._features[stft_name], self._T))[:, :, :self._F, :] - def _inverse_stft(self, stft): + @property + def model_outputs(self): + if not hasattr(self, "_model_outputs"): + self._build_model_outputs() + return self._model_outputs + + @property + def outputs(self): + if not hasattr(self, "_outputs"): + self._build_outputs() + return self._outputs + + @property + def stft_feature(self): + if self.stft_name not in self._features: + self._build_stft_feature() + return self._features[self.stft_name] + + @property + def spectrogram_feature(self): + if self.spectrogram_name not in self._features: + self._build_stft_feature() + return self._features[self.spectrogram_name] + + @property + def masks(self): + if not hasattr(self, "_masks"): + self._build_masks() + return self._masks + + def _inverse_stft(self, stft, time_crop=None): """ Inverse and reshape the given STFT :param stft: input STFT @@ -215,20 +334,21 @@ class EstimatorSpecBuilder(object): hann_window(frame_length, periodic=True, dtype=dtype)) ) * self.WINDOW_COMPENSATION_FACTOR reshaped = tf.transpose(inversed) - return reshaped[:tf.shape(self._features['waveform'])[0], :] + if time_crop is None: + time_crop = tf.shape(self._features['waveform'])[0] + return reshaped[:time_crop, :] - def _build_mwf_output_waveform(self, output_dict): + def _build_mwf_output_waveform(self): """ Perform separation with multichannel Wiener Filtering using Norbert. Note: multichannel Wiener Filtering is not coded in Tensorflow and thus may be quite slow. - :param output_dict: dictionary of estimated spectrogram (key: instrument - name, value: estimated spectrogram of the instrument) :returns: dictionary of separated waveforms (key: instrument name, value: estimated waveform of the instrument) """ import norbert # pylint: disable=import-error - x = self._features[f'{self._mix_name}_stft'] + output_dict = self.model_outputs + x = self.stft_feature v = tf.stack( [ pad_and_reshape( @@ -272,11 +392,13 @@ class EstimatorSpecBuilder(object): mask_shape[-1])) else: raise ValueError(f'Invalid mask_extension parameter {extension}') - n_extra_row = (self._frame_length) // 2 + 1 - self._F + n_extra_row = self._frame_length // 2 + 1 - self._F extension = tf.tile(extension_row, [1, 1, n_extra_row, 1]) return tf.concat([mask, extension], axis=2) - def _build_masks(self, output_dict): + def _build_masks(self): + output_dict = self.model_outputs + stft_feature = self.stft_feature separation_exponent = self._params['separation_exponent'] output_sum = tf.reduce_sum( [e ** separation_exponent for e in output_dict.values()], @@ -297,21 +419,20 @@ class EstimatorSpecBuilder(object): axis=0) instrument_mask = tf.reshape(instrument_mask, new_shape) # Remove padded part (for mask having the same size as STFT); - stft_feature = self._features[f'{self._mix_name}_stft'] + instrument_mask = instrument_mask[ :tf.shape(stft_feature)[0], ...] out[instrument] = instrument_mask - return out + self._masks = out - def _build_masked_stft(self, mask_dict, input_stft=None): - if input_stft is None: - input_stft = self._features[f'{self._mix_name}_stft'] + def _build_masked_stft(self): + input_stft = self.stft_feature out = {} - for instrument, mask in mask_dict.items(): + for instrument, mask in self.masks.items(): out[instrument] = tf.cast(mask, dtype=tf.complex64) * input_stft return out - def _build_manual_output_waveform(self, output_dict): + def _build_manual_output_waveform(self, masked_stft): """ Perform ratio mask separation :param output_dict: dictionary of estimated spectrogram (key: instrument @@ -321,27 +442,34 @@ class EstimatorSpecBuilder(object): """ output_waveform = {} - masked_stft = self._build_masked_stft(self._build_masks(output_dict)) for instrument, stft_data in masked_stft.items(): output_waveform[instrument] = self._inverse_stft(stft_data) return output_waveform - def _build_output_waveform(self, output_dict): + def _build_output_waveform(self, masked_stft): """ Build output waveform from given output dict in order to be used in prediction context. Regarding of the configuration building method will be using MWF. - :param output_dict: Output dict to build output waveform from. :returns: Built output waveform. """ + if self._params.get('MWF', False): - output_waveform = self._build_mwf_output_waveform(output_dict) + output_waveform = self._build_mwf_output_waveform() else: - output_waveform = self._build_manual_output_waveform(output_dict) - if 'audio_id' in self._features: - output_waveform['audio_id'] = self._features['audio_id'] + output_waveform = self._build_manual_output_waveform(masked_stft) return output_waveform + def _build_outputs(self): + masked_stft = self._build_masked_stft() + if self.include_stft_computations(): + self._outputs = self._build_output_waveform(masked_stft) + else: + self._outputs = masked_stft + + if 'audio_id' in self._features: + self._outputs['audio_id'] = self._features['audio_id'] + def build_predict_model(self): """ Builder interface for creating model instance that aims to perform prediction / inference over given track. The output of such estimator @@ -350,12 +478,10 @@ class EstimatorSpecBuilder(object): :returns: An estimator for performing prediction. """ - self._build_stft_feature() - output_dict = self._build_output_dict() - output_waveform = self._build_output_waveform(output_dict) + return tf.estimator.EstimatorSpec( tf.estimator.ModeKeys.PREDICT, - predictions=output_waveform) + predictions=self.outputs) def build_evaluation_model(self, labels): """ Builder interface for creating model instance that aims to perform @@ -366,8 +492,7 @@ class EstimatorSpecBuilder(object): :param labels: Model labels. :returns: An estimator for performing model evaluation. """ - output_dict = self._build_output_dict() - loss, metrics = self._build_loss(output_dict, labels) + loss, metrics = self._build_loss(labels) return tf.estimator.EstimatorSpec( tf.estimator.ModeKeys.EVAL, loss=loss, @@ -382,8 +507,7 @@ class EstimatorSpecBuilder(object): :param labels: Model labels. :returns: An estimator for performing model training. """ - output_dict = self._build_output_dict() - loss, metrics = self._build_loss(output_dict, labels) + loss, metrics = self._build_loss(labels) optimizer = self._build_optimizer() train_operation = optimizer.minimize( loss=loss, diff --git a/spleeter/separator.py b/spleeter/separator.py index f7eac1b..1f4d177 100644 --- a/spleeter/separator.py +++ b/spleeter/separator.py @@ -20,14 +20,15 @@ from multiprocessing import Pool from os.path import basename, join, splitext import numpy as np import tensorflow as tf - +from librosa.core import stft, istft +from scipy.signal.windows import hann from . import SpleeterError from .audio.adapter import get_default_audio_adapter from .audio.convertor import to_stereo from .utils.configuration import load_configuration -from .utils.estimator import create_estimator, to_predictor, get_input_dict_placeholders, get_default_model_dir -from .model import EstimatorSpecBuilder +from .utils.estimator import create_estimator, to_predictor, get_default_model_dir +from .model import EstimatorSpecBuilder, InputProviderFactory __email__ = 'research@deezer.com' @@ -41,7 +42,7 @@ logger = logging.getLogger("spleeter") class Separator(object): """ A wrapper class for performing separation. """ - def __init__(self, params_descriptor, MWF=False, multiprocess=True): + def __init__(self, params_descriptor, MWF=False, stft_backend="auto", multiprocess=True): """ Default constructor. :param params_descriptor: Descriptor for TF params to be used. @@ -53,6 +54,7 @@ class Separator(object): self._predictor = None self._pool = Pool() if multiprocess else None self._tasks = [] + self._params["stft_backend"] = stft_backend def _get_predictor(self): """ Lazy loading access method for internal predictor instance. @@ -120,60 +122,41 @@ class Separator(object): assert chunk_size % d == 0 return chunk_size//d - def separate_chunked(self, waveform, sample_rate, chunk_max_duration): - chunk_size = self.get_valid_chunk_size(sample_rate, chunk_max_duration) - print(f"chunk size is {chunk_size}") - batch_size = self.get_batch_size_for_chunk_size(chunk_size) - print(f"batch size {batch_size}") - T, F = self._params["T"], self._params["F"] + def stft(self, waveform, inverse=False): + N = self._params["frame_length"] + H = self._params["frame_step"] + win = hann(N, sym=False) + fstft = istft if inverse else stft + win_len_arg = "win_length" if inverse else "n_fft" + s1 = fstft(waveform[:, 0], hop_length=H, window=win, center=False, **{win_len_arg: N}) + s2 = fstft(waveform[:, 1], hop_length=H, window=win, center=False, **{win_len_arg: N}) + s1 = np.expand_dims(s1.T, 2-inverse) + s2 = np.expand_dims(s2.T, 2-inverse) + return np.concatenate([s1, s2], axis=2-inverse) + + def separate_librosa(self, waveform, audio_id): out = {} - n_batches = (waveform.shape[0]+batch_size*T*F-1)//(batch_size*T*F) - print(f"{n_batches} to compute") - features = get_input_dict_placeholders(self._params) - spectrogram_input_t = tf.placeholder(tf.float32, shape=(None, T, F, 2), name="spectrogram_input") - istft_input_t = tf.placeholder(tf.complex64, shape=(None, F, 2), name="istft_input") - start_t = tf.placeholder(tf.int32, shape=(), name="start") - end_t = tf.placeholder(tf.int32, shape=(), name="end") + input_provider = InputProviderFactory.get(self._params) + features = input_provider.get_input_dict_placeholders() + builder = EstimatorSpecBuilder(features, self._params) latest_checkpoint = tf.train.latest_checkpoint(get_default_model_dir(self._params['model_dir'])) # TODO: fix the logic, build sometimes return, sometimes set attribute - builder._build_stft_feature() - stft_t = builder.get_stft_feature() - output_dict_t = builder._build_output_dict(input_tensor=spectrogram_input_t) - masked_stft_t = builder._build_masked_stft(builder._build_masks(output_dict_t), - input_stft=stft_t[start_t:end_t, :, :]) - output_waveform_t = builder._inverse_stft(istft_input_t) - waveform_t = features["waveform"] - masked_stfts = {} + outputs = builder.outputs + saver = tf.train.Saver() - + stft = self.stft(waveform) with tf.Session() as sess: - print("restoring weights {}".format(time())) saver.restore(sess, latest_checkpoint) - print("computing spectrogram {}".format(time())) - spectrogram, stft = sess.run([builder.get_spectrogram_feature(), stft_t], feed_dict={waveform_t: waveform}) - print(spectrogram.shape) - print(stft.shape) - for i in range(n_batches): - print("computing batch {} {}".format(i, time())) - start = i*batch_size - end = (i+1)*batch_size - tmp = sess.run(masked_stft_t, - feed_dict={spectrogram_input_t: spectrogram[start:end, ...], - start_t: start*T, end_t: end*T, stft_t: stft}) - for instrument, masked_stft in tmp.items(): - masked_stfts.setdefault(instrument, []).append(masked_stft) - - print("inverting spectrogram {}".format(time())) - for instrument, masked_stft in masked_stfts.items(): - out[instrument] = sess.run(output_waveform_t, {istft_input_t: np.concatenate(masked_stft, axis=0)}) - print("done separating {}".format(time())) + outputs = sess.run(outputs, feed_dict=input_provider.get_feed_dict(features, stft, audio_id)) + for inst in builder.instruments: + out[inst] = self.stft(outputs[inst], inverse=True) return out def separate_to_file( self, audio_descriptor, destination, - audio_adapter=get_default_audio_adapter(), chunk_duration=-1, + audio_adapter=get_default_audio_adapter(), offset=0, duration=600., codec='wav', bitrate='128k', filename_format='{filename}/{instrument}.{codec}', synchronous=True): @@ -198,15 +181,15 @@ class Separator(object): :param filename_format: (Optional) Filename format. :param synchronous: (Optional) True is should by synchronous. """ - print("loading audio {}".format(time())) waveform, sample_rate = audio_adapter.load( audio_descriptor, offset=offset, duration=duration, sample_rate=self._sample_rate) - print("done loading audio {}".format(time())) - sources = self.separate_chunked(waveform, sample_rate, chunk_duration) - print("saving to file {}".format(time())) + if self._params["stft_backend"] == "tensorflow": + sources = self.separate(waveform) + else: + sources = self.separate_librosa(waveform, audio_descriptor) self.save_to_file(sources, audio_descriptor, destination, filename_format, codec, audio_adapter, bitrate, synchronous) diff --git a/spleeter/utils/estimator.py b/spleeter/utils/estimator.py index 2b17e96..a9aa736 100644 --- a/spleeter/utils/estimator.py +++ b/spleeter/utils/estimator.py @@ -13,7 +13,7 @@ import tensorflow as tf from tensorflow.contrib import predictor # pylint: enable=import-error -from ..model import model_fn +from ..model import model_fn, InputProviderFactory from ..model.provider import get_default_model_provider # Default exporting directory for predictor. @@ -59,14 +59,6 @@ def create_estimator(params, MWF): return estimator -def get_input_dict_placeholders(params): - shape = (None, params['n_channels']) - features = { - 'waveform': tf.compat.v1.placeholder(tf.float32, shape=shape, name="waveform"), - 'audio_id': tf.compat.v1.placeholder(tf.string, name="audio_id")} - return features - - def to_predictor(estimator, directory=DEFAULT_EXPORT_DIRECTORY): """ Exports given estimator as predictor into the given directory and returns associated tf.predictor instance. @@ -74,8 +66,10 @@ def to_predictor(estimator, directory=DEFAULT_EXPORT_DIRECTORY): :param estimator: Estimator to export. :param directory: (Optional) path to write exported model into. """ + + input_provider = InputProviderFactory.get(estimator.params) def receiver(): - features = get_input_dict_placeholders(estimator.params) + features = input_provider.get_input_dict_placeholders() return tf.estimator.export.ServingInputReceiver(features, features) estimator.export_saved_model(directory, receiver) From 31f823aaa4b671ea55328ccf8551f2febe2d9aa9 Mon Sep 17 00:00:00 2001 From: akhlif Date: Wed, 26 Feb 2020 17:51:28 +0100 Subject: [PATCH 08/15] removing useless code --- spleeter/separator.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/spleeter/separator.py b/spleeter/separator.py index 1f4d177..87accd5 100644 --- a/spleeter/separator.py +++ b/spleeter/separator.py @@ -98,30 +98,6 @@ class Separator(object): prediction.pop('audio_id') return prediction - def get_valid_chunk_size(self, sample_rate: int, chunk_max_duration: float) -> int: - """ - Given a sample rate, and a maximal duration that a chunk can represent, return the maximum chunk - size in samples. The chunk size must be a non-zero multiple of T (temporal dimension of the input spectrogram) - times F (number of frequency bins in the input spectrogram). If no such value exist, we return T*F. - - :param sample_rate: sample rate of the pcm data - :param chunk_max_duration: maximal duration in seconds of a chunk - :return: highest non-zero chunk size of duration less than chunk_max_duration or minimal valid chunk size. - """ - assert chunk_max_duration > 0 - chunk_size = chunk_max_duration * sample_rate - min_sample_size = self._params["T"] * self._params["F"] - if chunk_size < min_sample_size: - min_duration = min_sample_size / sample_rate - logger.warning("chunk_duration must be at least {:.2f} seconds. Ignoring parameter".format(min_duration)) - chunk_size = min_sample_size - return min_sample_size*int(chunk_size/min_sample_size) - - def get_batch_size_for_chunk_size(self, chunk_size): - d = self._params["T"] * self._params["F"] - assert chunk_size % d == 0 - return chunk_size//d - def stft(self, waveform, inverse=False): N = self._params["frame_length"] H = self._params["frame_step"] From 6001ae12a97583481b0651ce0cc6a5a6984ec062 Mon Sep 17 00:00:00 2001 From: akhlif Date: Thu, 27 Feb 2020 11:05:06 +0100 Subject: [PATCH 09/15] Fixing the stft/istft computations --- spleeter/separator.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/spleeter/separator.py b/spleeter/separator.py index 87accd5..73a1a89 100644 --- a/spleeter/separator.py +++ b/spleeter/separator.py @@ -98,14 +98,23 @@ class Separator(object): prediction.pop('audio_id') return prediction - def stft(self, waveform, inverse=False): + def stft(self, data, inverse=False): + """ + Single entrypoint for both stft and istft. This computes stft and istft with librosa on stereo data. The two + channels are processed separately and are concatenated together in the result. The expected input formats are: + (n_samples, 2) for stft and (T, F, 2) for istft. + :param data: np.array with either the waveform or the complex spectrogram depending on the parameter inverse + :param inverse: should a stft or an istft be computed. + :return: Stereo data as numpy array for the transform. The channels are stored in the last dimension + """ N = self._params["frame_length"] H = self._params["frame_step"] win = hann(N, sym=False) fstft = istft if inverse else stft - win_len_arg = "win_length" if inverse else "n_fft" - s1 = fstft(waveform[:, 0], hop_length=H, window=win, center=False, **{win_len_arg: N}) - s2 = fstft(waveform[:, 1], hop_length=H, window=win, center=False, **{win_len_arg: N}) + win_len_arg = {"win_length": None} if inverse else {"n_fft": N} + dl, dr = (data[:, :, 0].T, data[:, :, 1].T) if inverse else (data[:, 0], data[:, 1]) + s1 = fstft(dl, hop_length=H, window=win, center=False, **win_len_arg) + s2 = fstft(dr, hop_length=H, window=win, center=False, **win_len_arg) s1 = np.expand_dims(s1.T, 2-inverse) s2 = np.expand_dims(s2.T, 2-inverse) return np.concatenate([s1, s2], axis=2-inverse) @@ -127,7 +136,7 @@ class Separator(object): saver.restore(sess, latest_checkpoint) outputs = sess.run(outputs, feed_dict=input_provider.get_feed_dict(features, stft, audio_id)) for inst in builder.instruments: - out[inst] = self.stft(outputs[inst], inverse=True) + out[inst] = self.stft(outputs[inst], inverse=True)[:waveform.shape[0], :] return out def separate_to_file( From d177525ea7ed73f816c75a96708bc470d3eb38ef Mon Sep 17 00:00:00 2001 From: akhlif Date: Thu, 27 Feb 2020 14:13:59 +0100 Subject: [PATCH 10/15] Moving get_backend to separator --- spleeter/commands/separate.py | 9 +-------- spleeter/model/__init__.py | 19 ++++++++++++++----- spleeter/separator.py | 11 ++++++++++- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/spleeter/commands/separate.py b/spleeter/commands/separate.py index 2fac8aa..158190c 100644 --- a/spleeter/commands/separate.py +++ b/spleeter/commands/separate.py @@ -11,8 +11,6 @@ -i /path/to/audio1.wav /path/to/audio2.mp3 """ -import tensorflow as tf - from ..audio.adapter import get_audio_adapter from ..separator import Separator @@ -21,11 +19,6 @@ __author__ = 'Deezer Research' __license__ = 'MIT License' -def get_backend(backend): - if backend == "auto": - return "tensorflow" if tf.test.is_gpu_available() else "librosa" - return backend - def entrypoint(arguments, params): """ Command entrypoint. @@ -38,7 +31,7 @@ def entrypoint(arguments, params): separator = Separator( arguments.configuration, MWF=arguments.MWF, - stft_backend=get_backend(arguments.stft_backend)) + stft_backend=arguments.stft_backend) for filename in arguments.inputs: separator.separate_to_file( filename, diff --git a/spleeter/model/__init__.py b/spleeter/model/__init__.py index 4c3e200..b99a9dd 100644 --- a/spleeter/model/__init__.py +++ b/spleeter/model/__init__.py @@ -320,6 +320,12 @@ class EstimatorSpecBuilder(object): self._build_masks() return self._masks + @property + def masked_stfts(self): + if not hasattr(self, "_masked_stfts"): + self._build_masked_stfts() + return self._masked_stfts + def _inverse_stft(self, stft, time_crop=None): """ Inverse and reshape the given STFT @@ -397,6 +403,10 @@ class EstimatorSpecBuilder(object): return tf.concat([mask, extension], axis=2) def _build_masks(self): + """ + Compute masks from the output spectrograms of the model. + :return: + """ output_dict = self.model_outputs stft_feature = self.stft_feature separation_exponent = self._params['separation_exponent'] @@ -425,12 +435,12 @@ class EstimatorSpecBuilder(object): out[instrument] = instrument_mask self._masks = out - def _build_masked_stft(self): + def _build_masked_stfts(self): input_stft = self.stft_feature out = {} for instrument, mask in self.masks.items(): out[instrument] = tf.cast(mask, dtype=tf.complex64) * input_stft - return out + self._masked_stfts = out def _build_manual_output_waveform(self, masked_stft): """ Perform ratio mask separation @@ -461,11 +471,10 @@ class EstimatorSpecBuilder(object): return output_waveform def _build_outputs(self): - masked_stft = self._build_masked_stft() if self.include_stft_computations(): - self._outputs = self._build_output_waveform(masked_stft) + self._outputs = self._build_output_waveform(self.masked_stfts) else: - self._outputs = masked_stft + self._outputs = self.masked_stfts if 'audio_id' in self._features: self._outputs['audio_id'] = self._features['audio_id'] diff --git a/spleeter/separator.py b/spleeter/separator.py index 73a1a89..668b59d 100644 --- a/spleeter/separator.py +++ b/spleeter/separator.py @@ -39,6 +39,14 @@ __license__ = 'MIT License' logger = logging.getLogger("spleeter") + +def get_backend(backend): + assert backend in ["auto", "tensorflow", "librosa"] + if backend == "auto": + return "tensorflow" if tf.test.is_gpu_available() else "librosa" + return backend + + class Separator(object): """ A wrapper class for performing separation. """ @@ -48,13 +56,14 @@ class Separator(object): :param params_descriptor: Descriptor for TF params to be used. :param MWF: (Optional) True if MWF should be used, False otherwise. """ + self._params = load_configuration(params_descriptor) self._sample_rate = self._params['sample_rate'] self._MWF = MWF self._predictor = None self._pool = Pool() if multiprocess else None self._tasks = [] - self._params["stft_backend"] = stft_backend + self._params["stft_backend"] = get_backend(stft_backend) def _get_predictor(self): """ Lazy loading access method for internal predictor instance. From 922fcd85bbde235aac58098e7529f0466fc79fa3 Mon Sep 17 00:00:00 2001 From: akhlif Date: Thu, 27 Feb 2020 14:37:02 +0100 Subject: [PATCH 11/15] add librosa dependency --- requirements.txt | 3 ++- setup.py | 3 ++- spleeter/model/__init__.py | 9 ++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8e1f7be..cf289ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ setuptools>=41.0.0 pandas==0.25.1 tensorflow==1.14.0 ffmpeg-python -norbert==0.2.1 \ No newline at end of file +norbert==0.2.1 +librosa==0.7.2 \ No newline at end of file diff --git a/setup.py b/setup.py index b70dcdc..f89042f 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ __license__ = 'MIT License' # Default project values. project_name = 'spleeter' -project_version = '1.4.9' +project_version = '1.5.0' tensorflow_dependency = 'tensorflow' tensorflow_version = '1.14.0' here = path.abspath(path.dirname(__file__)) @@ -56,6 +56,7 @@ setup( 'pandas==0.25.1', 'requests', 'setuptools>=41.0.0', + 'librosa==0.7.2', '{}=={}'.format(tensorflow_dependency, tensorflow_version), ], extras_require={ diff --git a/spleeter/model/__init__.py b/spleeter/model/__init__.py index b99a9dd..6c2fe3a 100644 --- a/spleeter/model/__init__.py +++ b/spleeter/model/__init__.py @@ -18,7 +18,6 @@ __author__ = 'Deezer Research' __license__ = 'MIT License' - placeholder = tf.compat.v1.placeholder @@ -326,14 +325,14 @@ class EstimatorSpecBuilder(object): self._build_masked_stfts() return self._masked_stfts - def _inverse_stft(self, stft, time_crop=None): + def _inverse_stft(self, stft_t, time_crop=None): """ Inverse and reshape the given STFT - :param stft: input STFT + :param stft_t: input STFT :returns: inverse STFT (waveform) """ inversed = inverse_stft( - tf.transpose(stft, perm=[2, 0, 1]), + tf.transpose(stft_t, perm=[2, 0, 1]), self._frame_length, self._frame_step, window_fn=lambda frame_length, dtype: ( @@ -419,7 +418,7 @@ class EstimatorSpecBuilder(object): output = output_dict[f'{instrument}_spectrogram'] # Compute mask with the model. instrument_mask = (output ** separation_exponent - + (self.EPSILON / len(output_dict))) / output_sum + + (self.EPSILON / len(output_dict))) / output_sum # Extend mask; instrument_mask = self._extend_mask(instrument_mask) # Stack back mask. From 3cba6985f410750f9704303668e4093dde29f7a3 Mon Sep 17 00:00:00 2001 From: akhlif Date: Thu, 27 Feb 2020 15:38:46 +0100 Subject: [PATCH 12/15] Updating tests to test for librosa backend --- spleeter/model/__init__.py | 2 +- spleeter/separator.py | 23 ++++++++++++++--------- tests/test_separator.py | 37 +++++++++++++++++++++---------------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/spleeter/model/__init__.py b/spleeter/model/__init__.py index 6c2fe3a..531f84c 100644 --- a/spleeter/model/__init__.py +++ b/spleeter/model/__init__.py @@ -105,7 +105,7 @@ class InputProviderFactory(object): @staticmethod def get(params): stft_backend = params["stft_backend"] - assert stft_backend in ("tensorflow", "librosa") + assert stft_backend in ("tensorflow", "librosa"), "Unexpected backend {}".format(stft_backend) if stft_backend == "tensorflow": return WaveformInputProvider(params) else: diff --git a/spleeter/separator.py b/spleeter/separator.py index 668b59d..174c73f 100644 --- a/spleeter/separator.py +++ b/spleeter/separator.py @@ -85,7 +85,7 @@ class Separator(object): task.get() task.wait(timeout=timeout) - def separate(self, waveform): + def separate_tensorflow(self, waveform, audio_descriptor): """ Performs source separation over the given waveform. The separation is performed synchronously but the result @@ -103,11 +103,11 @@ class Separator(object): predictor = self._get_predictor() prediction = predictor({ 'waveform': waveform, - 'audio_id': ''}) + 'audio_id': audio_descriptor}) prediction.pop('audio_id') return prediction - def stft(self, data, inverse=False): + def stft(self, data, inverse=False, length=None): """ Single entrypoint for both stft and istft. This computes stft and istft with librosa on stereo data. The two channels are processed separately and are concatenated together in the result. The expected input formats are: @@ -116,11 +116,13 @@ class Separator(object): :param inverse: should a stft or an istft be computed. :return: Stereo data as numpy array for the transform. The channels are stored in the last dimension """ + assert not (inverse and length is None) + data = np.asfortranarray(data) N = self._params["frame_length"] H = self._params["frame_step"] win = hann(N, sym=False) fstft = istft if inverse else stft - win_len_arg = {"win_length": None} if inverse else {"n_fft": N} + win_len_arg = {"win_length": None, "length": length} if inverse else {"n_fft": N} dl, dr = (data[:, :, 0].T, data[:, :, 1].T) if inverse else (data[:, 0], data[:, 1]) s1 = fstft(dl, hop_length=H, window=win, center=False, **win_len_arg) s2 = fstft(dr, hop_length=H, window=win, center=False, **win_len_arg) @@ -145,9 +147,15 @@ class Separator(object): saver.restore(sess, latest_checkpoint) outputs = sess.run(outputs, feed_dict=input_provider.get_feed_dict(features, stft, audio_id)) for inst in builder.instruments: - out[inst] = self.stft(outputs[inst], inverse=True)[:waveform.shape[0], :] + out[inst] = self.stft(outputs[inst], inverse=True, length=waveform.shape[0]) return out + def separate(self, waveform, audio_descriptor): + if self._params["stft_backend"] == "tensorflow": + return self.separate_tensorflow(waveform, audio_descriptor) + else: + return self.separate_librosa(waveform, audio_descriptor) + def separate_to_file( self, audio_descriptor, destination, audio_adapter=get_default_audio_adapter(), @@ -180,10 +188,7 @@ class Separator(object): offset=offset, duration=duration, sample_rate=self._sample_rate) - if self._params["stft_backend"] == "tensorflow": - sources = self.separate(waveform) - else: - sources = self.separate_librosa(waveform, audio_descriptor) + sources = self.separate(waveform, audio_descriptor) self.save_to_file(sources, audio_descriptor, destination, filename_format, codec, audio_adapter, bitrate, synchronous) diff --git a/tests/test_separator.py b/tests/test_separator.py index 9235731..271fdfb 100644 --- a/tests/test_separator.py +++ b/tests/test_separator.py @@ -13,6 +13,7 @@ from os.path import splitext, basename, exists, join from tempfile import TemporaryDirectory import pytest +import numpy as np from spleeter import SpleeterError from spleeter.audio.adapter import get_default_audio_adapter @@ -21,34 +22,38 @@ from spleeter.separator import Separator TEST_AUDIO_DESCRIPTOR = 'audio_example.mp3' TEST_AUDIO_BASENAME = splitext(basename(TEST_AUDIO_DESCRIPTOR))[0] TEST_CONFIGURATIONS = [ - ('spleeter:2stems', ('vocals', 'accompaniment')), - ('spleeter:4stems', ('vocals', 'drums', 'bass', 'other')), - ('spleeter:5stems', ('vocals', 'drums', 'bass', 'piano', 'other')) + ('spleeter:2stems', ('vocals', 'accompaniment'), 'tensorflow'), + ('spleeter:4stems', ('vocals', 'drums', 'bass', 'other'), 'tensorflow'), + ('spleeter:5stems', ('vocals', 'drums', 'bass', 'piano', 'other'), 'tensorflow'), + ('spleeter:2stems', ('vocals', 'accompaniment'), 'librosa'), + ('spleeter:4stems', ('vocals', 'drums', 'bass', 'other'), 'librosa'), + ('spleeter:5stems', ('vocals', 'drums', 'bass', 'piano', 'other'), 'librosa') ] -@pytest.mark.parametrize('configuration, instruments', TEST_CONFIGURATIONS) -def test_separate(configuration, instruments): +@pytest.mark.parametrize('configuration, instruments, backend', TEST_CONFIGURATIONS) +def test_separate(configuration, instruments, backend): """ Test separation from raw data. """ adapter = get_default_audio_adapter() waveform, _ = adapter.load(TEST_AUDIO_DESCRIPTOR) - separator = Separator(configuration) - prediction = separator.separate(waveform) + separator = Separator(configuration, stft_backend=backend) + prediction = separator.separate(waveform, TEST_AUDIO_DESCRIPTOR) assert len(prediction) == len(instruments) for instrument in instruments: assert instrument in prediction for instrument in instruments: track = prediction[instrument] - assert not (waveform == track).all() + assert waveform.shape == track.shape + assert not np.allclose(waveform, track) for compared in instruments: if instrument != compared: - assert not (track == prediction[compared]).all() + assert not np.allclose(track, prediction[compared]) -@pytest.mark.parametrize('configuration, instruments', TEST_CONFIGURATIONS) -def test_separate_to_file(configuration, instruments): +@pytest.mark.parametrize('configuration, instruments, backend', TEST_CONFIGURATIONS) +def test_separate_to_file(configuration, instruments, backend): """ Test file based separation. """ - separator = Separator(configuration) + separator = Separator(configuration, stft_backend=backend) with TemporaryDirectory() as directory: separator.separate_to_file( TEST_AUDIO_DESCRIPTOR, @@ -59,10 +64,10 @@ def test_separate_to_file(configuration, instruments): '{}/{}.wav'.format(TEST_AUDIO_BASENAME, instrument))) -@pytest.mark.parametrize('configuration, instruments', TEST_CONFIGURATIONS) -def test_filename_format(configuration, instruments): +@pytest.mark.parametrize('configuration, instruments, backend', TEST_CONFIGURATIONS) +def test_filename_format(configuration, instruments, backend): """ Test custom filename format. """ - separator = Separator(configuration) + separator = Separator(configuration, stft_backend=backend) with TemporaryDirectory() as directory: separator.separate_to_file( TEST_AUDIO_DESCRIPTOR, @@ -74,7 +79,7 @@ def test_filename_format(configuration, instruments): 'export/{}/{}.wav'.format(TEST_AUDIO_BASENAME, instrument))) -def test_filename_confilct(): +def test_filename_conflict(): """ Test error handling with static pattern. """ separator = Separator(TEST_CONFIGURATIONS[0][0]) with TemporaryDirectory() as directory: From 6b713edfb9b23b6c40a64950be654052d3ee7b3b Mon Sep 17 00:00:00 2001 From: romi1502 Date: Fri, 6 Mar 2020 18:11:36 +0100 Subject: [PATCH 13/15] Added paper for JOSS submission --- paper.bib | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ paper.md | 96 ++++++++++++++++++++++++++++++++++ 2 files changed, 247 insertions(+) create mode 100644 paper.bib create mode 100644 paper.md diff --git a/paper.bib b/paper.bib new file mode 100644 index 0000000..819c309 --- /dev/null +++ b/paper.bib @@ -0,0 +1,151 @@ +% bibtex + +@article{SISEC18, + author = {{St{\"o}ter}, Fabian-Robert and {Liutkus}, Antoine and {Ito}, Nobutaka}, + title = "{The 2018 Signal Separation Evaluation Campaign}", + journal = {arXiv e-prints}, + keywords = {Electrical Engineering and Systems Science - Audio and Speech Processing, Computer Science - Sound}, + year = "2018", + month = "Apr", + eid = {arXiv:1804.06267}, + pages = {arXiv:1804.06267}, +archivePrefix = {arXiv}, + eprint = {1804.06267}, + primaryClass = {eess.AS}, + adsurl = {https://ui.adsabs.harvard.edu/abs/2018arXiv180406267S}, + adsnote = {Provided by the SAO/NASA Astrophysics Data System} +} + +@misc{spleeter2019, + title={Spleeter: A Fast And State-of-the Art Music Source Separation Tool With Pre-trained Models}, + author={Romain Hennequin and Anis Khlif and Felix Voituret and Manuel Moussallam}, + howpublished={Late-Breaking/Demo ISMIR 2019}, + month={November}, + note={Deezer Research}, + year={2019} +} + +@inproceedings{unet2017, + title={Singing voice separation with deep U-Net convolutional networks}, + author={Jansson, Andreas and Humphrey, Eric J. and Montecchio, Nicola and Bittner, Rachel and Kumar, Aparna and Weyde, Tillman}, + booktitle={Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)}, + pages={323--332}, + year={2017} +} + +@inproceedings{deezerICASSP2019, +author={Laure {Pr\'etet} and Romain {Hennequin} and Jimena {Royo-Letelier} and Andrea {Vaglio}}, +booktitle={ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, +title={Singing Voice Separation: A Study on Training Data}, +year={2019}, +volume={}, +number={}, +pages={506-510}, +keywords={feature extraction;source separation;speech processing;supervised training;separation quality;data augmentation;singing voice separation systems;singing voice separation algorithms;separation diversity;source separation;supervised learning;training data;data augmentation}, +doi={10.1109/ICASSP.2019.8683555}, +ISSN={}, +month={May},} + + +@misc{Norbert, + author = {Antoine Liutkus and + Fabian-Robert St{\"o}ter}, + title = {sigsep/norbert: First official Norbert release}, + month = jul, + year = 2019, + doi = {10.5281/zenodo.3269749}, + url = {https://doi.org/10.5281/zenodo.3269749} +} + +@ARTICLE{separation_metrics, +author={Emmanuel {Vincent} and Remi {Gribonval} and Cedric {Fevotte}}, +journal={IEEE Transactions on Audio, Speech, and Language Processing}, +title={Performance measurement in blind audio source separation}, +year={2006}, +volume={14}, +number={4}, +pages={1462-1469}, +keywords={audio signal processing;blind source separation;distortion;time-varying filters;blind audio source separation;distortions;time-invariant gains;time-varying filters;source estimation;interference;additive noise;algorithmic artifacts;Source separation;Data mining;Filters;Additive noise;Microphones;Distortion measurement;Energy measurement;Independent component analysis;Interference;Image analysis;Audio source separation;evaluation;measure;performance;quality}, +doi={10.1109/TSA.2005.858005}, +ISSN={}, +month={July},} + +@misc{musdb18, + author = {Rafii, Zafar and + Liutkus, Antoine and + Fabian-Robert St{\"o}ter and + Mimilakis, Stylianos Ioannis and + Bittner, Rachel}, + title = {The {MUSDB18} corpus for music separation}, + month = dec, + year = 2017, + doi = {10.5281/zenodo.1117372}, + url = {https://doi.org/10.5281/zenodo.1117372} +} + + +@misc{tensorflow2015-whitepaper, +title={ {TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems}, +url={https://www.tensorflow.org/}, +note={Software available from tensorflow.org}, +author={ + Abadi, Mart{\'{\i}}n et al.}, + year={2015}, +} + +@article{2019arXiv190611139L, + author = {{Lee}, Kyungyun and {Nam}, Juhan}, + title = "{Learning a Joint Embedding Space of Monophonic and Mixed Music Signals for Singing Voice}", + journal = {arXiv e-prints}, + keywords = {Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing}, + year = "2019", + month = "Jun", + eid = {arXiv:1906.11139}, + pages = {arXiv:1906.11139}, +archivePrefix = {arXiv}, + eprint = {1906.11139}, + primaryClass = {cs.SD}, + adsurl = {https://ui.adsabs.harvard.edu/abs/2019arXiv190611139L}, + adsnote = {Provided by the SAO/NASA Astrophysics Data System} +} + +@article{Adam, + author = {{Kingma}, Diederik P. and {Ba}, Jimmy}, + title = "{Adam: A Method for Stochastic Optimization}", + journal = {arXiv e-prints}, + keywords = {Computer Science - Machine Learning}, + year = "2014", + month = "Dec", + eid = {arXiv:1412.6980}, + pages = {arXiv:1412.6980}, +archivePrefix = {arXiv}, + eprint = {1412.6980}, + primaryClass = {cs.LG}, + adsurl = {https://ui.adsabs.harvard.edu/abs/2014arXiv1412.6980K}, + adsnote = {Provided by the SAO/NASA Astrophysics Data System} +} + +@article{Open-Unmix, + author={Fabian-Robert St\"{o}ter and Stefan Uhlich and Antoine Liutkus and Yuki Mitsufuji}, + title={Open-Unmix - A Reference Implementation for Music Source Separation}, + journal={Journal of Open Source Software}, + year=2019, + doi = {10.21105/joss.01667}, + url = {https://doi.org/10.21105/joss.01667} +} + +@misc{spleeter, + author={Romain Hennequin and Anis Khlif and Felix Voituret and Manuel Moussallam}, + title={Spleeter}, + year=2019, + url = {https://www.github.com/deezer/spleeter} +} + +@misc{demucs, + title={Music Source Separation in the Waveform Domain}, + author={Alexandre Défossez and Nicolas Usunier and Léon Bottou and Francis Bach}, + year={2019}, + eprint={1911.13254}, + archivePrefix={arXiv}, + primaryClass={cs.SD} +} \ No newline at end of file diff --git a/paper.md b/paper.md new file mode 100644 index 0000000..0ffb5cd --- /dev/null +++ b/paper.md @@ -0,0 +1,96 @@ +--- +title: 'Spleeter: a fast and state-of-the art music source separation tool with pre-trained models' +tags: + - Python + - musical signal processing + - source separation + - vocal isolation +authors: + - name: Romain Hennequin + orcid: 0000-0001-8158-5562 + affiliation: 1 + - name: Anis Khlif + affiliation: 1 + - name: Felix Voituret + affiliation: 1 + - name: Manuel Moussallam + orcid: 0000-0003-0886-5423 + affiliation: 1 +affiliations: + - name: Deezer Research, Paris + index: 1 +date: 04 March 2020 +bibliography: paper.bib + +--- + +## Summary + +We present and release a new tool for music source separation with pre-trained models called Spleeter. Spleeter was designed with ease of use, separation performance and speed in mind. Spleeter is based on Tensorflow [@tensorflow2015-whitepaper] and makes it possible to: + +- split music audio files into several stems with a single command line using pre-trained models. A music audio file can be separated into $2$ stems (vocals and accompaniments), $4$ stems (vocals, drums, bass and other) or $5$ stems (vocals, drums, bass, piano and other). +- train source separation models or fine-tune pre-trained ones with Tensorflow (provided you have a dataset of isolated sources). + +The performance of the pre-trained models are very close to the published state of the art and is one of the best performing $4$ stems separation model on the common musdb18 benchmark [@musdb18] to be publicly released. Spleeter is also very fast as it can separate a mix audio file into $4$ stems $100$ times faster than real-time (we note, though, that the model cannot be applied in real-time as it needs buffering) on a single Graphics Processing Unit (GPU) using the pre-trained $4$-stems model. + +## Purpose + +We release Spleeter with pre-trained state-of-the-art models in order to help the Music Information Retrieval (MIR) research community leverage the power of source separation in various MIR tasks, such as vocal lyrics analysis from audio (audio/lyrics alignement, lyrics transcription...), music transcription (chord transcription, drums transcription, bass transcription, chord estimation, beat tracking), singer identification, any type of multilabel classification (mood/genre...), vocal melody extraction or cover detection. +We believe that source separation has reached a level of maturity that makes it worth consideration for these tasks and that specific features computed from isolated vocals, drums or bass may help increase performances, especially in low data availability scenarios (small datasets, limited annotation availability) for which supervised learning might be difficult. +Spleeter also makes it possible to fine tune the provided state-of-the-art models in order to adapt the system to a specific use-case. +Finally, having an available source separation tool such as Spleeter will allow researchers to compare performances of their new models to a state-of-the-art one on their own private datasets instead of musdb18, which is usually the only used dataset for reporting separation performances for unreleased models. +Note that we cannot release the training data for copyright reasons, and thus, sharing pre-trained models were the only way to make these results available to the community. + +## Implementation details + +Spleeter contains pre-trained models for: + +- vocals/accompaniment separation. +- $4$ stems separation as in SiSec [@SISEC18] (vocals, bass, drums and other). +- $5$ stems separation with an extra piano stem (vocals, bass, drums, piano and other). It is, to the authors knowledge, the first released model to perform such a separation. + +The pre-trained models are U-nets [@unet2017] and follows similar specifications as in [@deezerICASSP2019]. The U-net is a encoder/decoder Convolutional Neural Network (CNN) architecture with skip connections. We used $12$-layer U-nets ($6$ layers for the encoder and $6$ for the decoder). A U-net is used for estimating a soft mask for each source (stem). Training loss is a $L_1$-norm between masked input mix spectrograms and source target spectrograms. The models were trained on Deezer internal datasets (noteworthily the Bean dataset that was used in [@deezerICASSP2019]) using Adam [@Adam]. Training time took approximately a full week on a single GPU. Separation is then done from estimated source spectrograms using soft masking or multi-channel Wiener filtering. + +Training and inference is implemented in Tensorflow which makes it possible to run the code on Central Processing Unit (CPU) or GPU. + +## Speed + +As the whole separation pipeline can be run on a GPU and the model is based on a CNN, computations are efficiently parallelized and model inference is very fast. For instance, Spleeter is able to separate the whole musdb18 test dataset (about $3$ hours and $27$ minutes of audio) into $4$ stems in less than $2$ minutes, including model loading time (about $15$ seconds), and audio wav files export, using a single GeForce RTX 2080 GPU, and a double Intel Xeon Gold 6134 CPU @ 3.20GHz (CPU is used for mix files loading and stem files export only). In this setup, Spleeter is able to process $100$ seconds of stereo audio in less than $1$ second, which makes it very useful for efficiently processing large datasets. + +## Separation performances + +The models compete with the state of the art on the standard musdb18 dataset [@musdb18] while it was not trained, validated or optimized in any way with musdb18 data. We report results in terms of standard source separation metrics [@separation_metrics], namely Signal to Distortion Ratio (SDR), Signal to Artifacts Ratio (SAR), Signal to Interference Ratio (SIR) and source Image to Spatial distortion Ratio (ISR), are presented in the following table compared to Open-Unmix [@Open-Unmix] and Demucs [@demucs] (only SDR are reported for Demucs since other metrics are not available in the paper) which are, to the authors knowledge, the only released system that perform near state-of-the-art performances. +We present results for soft masking and for multi-channel Wiener filtering (applied using Norbert [@Norbert]). As can be seen, for most metrics Spleeter is competitive with Open-Unmix and especially on SDR for all instruments, and is almost on par with Demucs. + + +| |Spleeter Mask |Spleeter MWF |Open-Unmix |Demucs| +|-----------|---------------|---------------|-----------|------| +| Vocals SDR|6.55 |6.86 |6.32 |7.05  | +| Vocals SIR|15.19 |15.86 |13.33 |  | +| Vocals SAR|6.44 |6.99 |6.52 |  | +| Vocals ISR|12.01 |11.95 |11.93 |  | +| Bass SDR |5.10 |5.51 |5.23 |6.70  | +| Bass SIR |10.01 |10.30 |10.93 |  | +| Bass SAR |5.15 |5.96 |6.34 |  | +| Bass ISR |9.18 |9.61 |9.23 |  | +| Drums SDR |5.93 |6.71 |5.73 |7.08  | +| Drums SIR |12.24 |13.67 |11.12 |  | +| Drums SAR |5.78 |6.54 |6.02 |  | +| Drums ISR |10.50 |10.69 |10.51 |  | +| Other SDR |4.24 |4.55 |4.02 |4.47  | +| Other SIR |7.86 |8.16 |6.59 |  | +| Other SAR |4.63 |4.88 |4.74 |  | +| Other ISR |9.83 |9.87 |9.31 |  | + + +Spleeter [@spleeter] source code and pre-trained models are available on [github](https://www.github.com/deezer/spleeter) and distributed under a MIT license. This repository will eventually be used for releasing other models with improved performances or models separating into more than $5$ stems in the future. + +## Distribution + +Spleeter is available as a standalone Python package, and also provided as a [conda](https://github.com/conda-forge/spleeter-feedstock) recipe and self-contained [Dockers](https://hub.docker.com/r/researchdeezer/spleeter) which makes it usable as is on various platforms. + +## Acknowledgements + +We acknowledge contributions from Laure Pretet who trained first models and wrote the first piece of code that lead to Spleeter. + +## References From 2365a9b37fa27782f48a59bd7689f92c92cdec5d Mon Sep 17 00:00:00 2001 From: akhlif Date: Fri, 20 Mar 2020 15:39:11 +0100 Subject: [PATCH 14/15] Removing duplicate logs --- spleeter/commands/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spleeter/commands/__init__.py b/spleeter/commands/__init__.py index 452b3a6..995e866 100644 --- a/spleeter/commands/__init__.py +++ b/spleeter/commands/__init__.py @@ -13,7 +13,6 @@ __email__ = 'research@deezer.com' __author__ = 'Deezer Research' __license__ = 'MIT License' -logging.basicConfig() # -i opt specification (separate). From 6539bf779b0fbe825718be140cb44138b40326ae Mon Sep 17 00:00:00 2001 From: akhlif Date: Fri, 20 Mar 2020 15:40:55 +0100 Subject: [PATCH 15/15] Bumping tensorflow version to do 1 PR merge instead of two --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index cf289ee..7f3bbf0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ importlib_resources; python_version<'3.7' requests setuptools>=41.0.0 pandas==0.25.1 -tensorflow==1.14.0 +tensorflow==1.15.0 ffmpeg-python norbert==0.2.1 librosa==0.7.2 \ No newline at end of file diff --git a/setup.py b/setup.py index f89042f..569ef49 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ __license__ = 'MIT License' project_name = 'spleeter' project_version = '1.5.0' tensorflow_dependency = 'tensorflow' -tensorflow_version = '1.14.0' +tensorflow_version = '1.15.0' here = path.abspath(path.dirname(__file__)) readme_path = path.join(here, 'README.md') with open(readme_path, 'r') as stream: