🎉 start typing integration

2026-01-31 14:58:23 +00:00 · 2020-12-07 14:38:37 +01:00
parent 49866b82a6
commit 194a50e7cf
7 changed files with 284 additions and 139 deletions
--- a/spleeter/audio/init.py
+++ b/spleeter/audio/init.py
@@ -10,6 +10,19 @@
    - Waveform conversion and transforming functions.
 """
 from enum import Enum
 __email__ = 'spleeter@deezer.com'
 __author__ = 'Deezer Research'
 __license__ = 'MIT License'
 class Codec(str, Enum):
     """
         Enumeration of supported audio codecs.
         Members also subclass `str`, so each one compares equal to its
         plain string value (e.g. `Codec.WAV == 'wav'`) and can be looked
         up from it (e.g. `Codec('wav')`).
     """
     # Members are intentionally left unannotated: the typing specification
     # asks type checkers to infer a literal type for each enum member, and
     # an explicit `str` annotation would be incorrect for them.
     WAV = 'wav'
     MP3 = 'mp3'
     OGG = 'ogg'
     M4A = 'm4a'
     WMA = 'wma'
     FLAC = 'flac'
--- a/spleeter/audio/adapter.py
+++ b/spleeter/audio/adapter.py
@@ -3,21 +3,22 @@
 """ AudioAdapter class defintion. """
 import subprocess
 from abc import ABC, abstractmethod
 from importlib import import_module
-from os.path import exists
+from pathlib import Path
 from spleeter.audio import Codec
 from typing import Any, Dict, List, Union
 from .. import SpleeterError
 from ..types import AudioDescriptor, Signal
 from ..utils.logging import get_logger
 # pyright: reportMissingImports=false
 # pylint: disable=import-error
 import numpy as np
 import tensorflow as tf
 from tensorflow.signal import stft, hann_window
 # pylint: enable=import-error
 from .. import SpleeterError
 from ..utils.logging import get_logger
 __email__ = 'spleeter@deezer.com'
 __author__ = 'Deezer Research'
@@ -27,46 +28,72 @@ __license__ = 'MIT License'
 class AudioAdapter(ABC):
    """ An abstract class for manipulating audio signal. """
-    # Default audio adapter singleton instance.
+    _DEFAULT: 'AudioAdapter' = None
-    DEFAULT = None
+    """ Default audio adapter singleton instance. """
    @abstractmethod
    def load(
-            self, audio_descriptor, offset, duration,
+            self,
-            sample_rate, dtype=np.float32):
+            audio_descriptor: AudioDescriptor,
-        """ Loads the audio file denoted by the given audio descriptor
+            offset: float,
-        and returns it data as a waveform. Aims to be implemented
+            duration: float,
-        by client.
+            sample_rate: float,
            dtype: np.dtype = np.float32) -> Signal:
        """
            Loads the audio file denoted by the given audio descriptor and
            returns its data as a waveform. Aims to be implemented by client.
-        :param audio_descriptor:    Describe song to load, in case of file
+            Parameters:
-                                    based audio adapter, such descriptor would
+                audio_descriptor (AudioDescriptor):
-                                    be a file path.
+                    Describe song to load, in case of file based audio adapter,
-        :param offset:              Start offset to load from in seconds.
+                    such descriptor would be a file path.
-        :param duration:            Duration to load in seconds.
+                offset (float):
-        :param sample_rate:         Sample rate to load audio with.
+                    Start offset to load from in seconds.
-        :param dtype:               Numpy data type to use, default to float32.
+                duration (float):
-        :returns:                   Loaded data as (wf, sample_rate) tuple.
+                    Duration to load in seconds.
                sample_rate (float):
                    Sample rate to load audio with.
                dtype (numpy.dtype):
                    (Optional) Numpy data type to use, default to `float32`.
            Returns:
                Signal:
                    Loaded data as (wf, sample_rate) tuple.
        """
        pass
    def load_tf_waveform(
-            self, audio_descriptor,
+            self,
-            offset=0.0, duration=1800., sample_rate=44100,
+            audio_descriptor,
-            dtype=b'float32', waveform_name='waveform'):
+            offset: float = 0.0,
-        """ Load the audio and convert it to a tensorflow waveform.
+            duration: float = 1800.,
            sample_rate: int = 44100,
            dtype: bytes = b'float32',
            waveform_name: str = 'waveform') -> Dict[str, Any]:
        """
            Load the audio and convert it to a tensorflow waveform.
-        :param audio_descriptor:    Describe song to load, in case of file
+            Parameters:
-                                    based audio adapter, such descriptor would
+                audio_descriptor (AudioDescriptor):
-                                    be a file path.
+                    Describe song to load, in case of file based audio adapter,
-        :param offset:              Start offset to load from in seconds.
+                    such descriptor would be a file path.
-        :param duration:            Duration to load in seconds.
+                offset (float):
-        :param sample_rate:         Sample rate to load audio with.
+                    Start offset to load from in seconds.
-        :param dtype:               Numpy data type to use, default to float32.
+                duration (float):
-        :param waveform_name:       (Optional) Name of the key in output dict.
+                    Duration to load in seconds.
-        :returns:                   TF output dict with waveform as
+                sample_rate (int):
-                                    (T x chan numpy array)  and a boolean that
+                    Sample rate to load audio with.
-                                    tells whether there were an error while
+                dtype (bytes):
-                                    trying to load the waveform.
+                    (Optional) Data type to use, default to `b'float32'`.
                waveform_name (str):
                    (Optional) Name of the key in output dict, default to
                    `'waveform'`.
            Returns:
                Dict[str, Any]:
                    TF output dict with waveform as `(T x chan numpy array)`
                    and a boolean that tells whether there was an error while
                    trying to load the waveform.
        """
        # Cast parameters to TF format.
        offset = tf.cast(offset, tf.float64)
@@ -100,50 +127,69 @@ class AudioAdapter(ABC):
        waveform, error = results[0]
        return {
            waveform_name: waveform,
-            f'{waveform_name}_error': error
+            f'{waveform_name}_error': error}
        }
    @abstractmethod
    def save(
-            self, path, data, sample_rate,
+            self,
-            codec=None, bitrate=None):
+            path: Union[Path, str],
-        """ Save the given audio data to the file denoted by
+            data: np.ndarray,
-        the given path.
+            sample_rate: float,
            codec: Codec = None,
            bitrate: str = None):
        """
            Save the given audio data to the file denoted by the given path.
-        :param path: Path of the audio file to save data in.
+            Parameters:
-        :param data: Waveform data to write.
+                path (Union[Path, str]):
-        :param sample_rate: Sample rate to write file in.
+                    Path-like location of the audio file to save data in.
-        :param codec: (Optional) Writing codec to use.
+                data (numpy.ndarray):
-        :param bitrate: (Optional) Bitrate of the written audio file.
+                    Waveform data to write.
                sample_rate (float):
                    Sample rate to write file in.
                codec (Codec):
                    (Optional) Writing codec to use, default to `None`.
                bitrate (str):
                    (Optional) Bitrate of the written audio file, default to
                    `None`.
        """
        pass
    @classmethod
    def default(cls: type) -> 'AudioAdapter':
        """
            Builds and returns a default audio adapter instance.
-def get_default_audio_adapter():
+            Returns:
-    """ Builds and returns a default audio adapter instance.
+                AudioAdapter:
                    Default adapter instance to use.
        """
        if cls._DEFAULT is None:
            from .ffmpeg import FFMPEGProcessAudioAdapter
            cls._DEFAULT = FFMPEGProcessAudioAdapter()
        return cls._DEFAULT
-    :returns: An audio adapter instance.
+    @classmethod
-    """
+    def get(cls: type, descriptor: str) -> 'AudioAdapter':
-    if AudioAdapter.DEFAULT is None:
+        """
-        from .ffmpeg import FFMPEGProcessAudioAdapter
+            Load dynamically an AudioAdapter from given class descriptor.
        AudioAdapter.DEFAULT = FFMPEGProcessAudioAdapter()
    return AudioAdapter.DEFAULT
            Parameters:
                descriptor (str):
                    Adapter class descriptor (module.Class)
-def get_audio_adapter(descriptor):
+            Returns:
-    """ Load dynamically an AudioAdapter from given class descriptor.
+                AudioAdapter:
-
+                    Created adapter instance.
+            Raises:
+                SpleeterError:
+                    If the resolved attribute is not an AudioAdapter
+                    subclass.
-    :param descriptor: Adapter class descriptor (module.Class)
+        """
-    :returns: Created adapter instance.
+        if not descriptor:
-    """
+            return cls.default()
-    if descriptor is None:
+        module_path: List[str] = descriptor.split('.')
-        return get_default_audio_adapter()
+        adapter_class_name: str = module_path[-1]
-    module_path = descriptor.split('.')
+        module_path: str = '.'.join(module_path[:-1])
-    adapter_class_name = module_path[-1]
+        adapter_module = import_module(module_path)
-    module_path = '.'.join(module_path[:-1])
+        adapter_class = getattr(adapter_module, adapter_class_name)
-    adapter_module = import_module(module_path)
+        # The resolved attribute is a class, not an instance: it has to be
+        # validated with issubclass. An isinstance check never matches a
+        # class object and would reject every valid adapter.
+        if not (isinstance(adapter_class, type)
+                and issubclass(adapter_class, AudioAdapter)):
-    adapter_class = getattr(adapter_module, adapter_class_name)
+            raise SpleeterError(
-    if not isinstance(adapter_class, AudioAdapter):
+                f'{adapter_class_name} is not a valid AudioAdapter class')
-        raise SpleeterError(
+        return adapter_class()
            f'{adapter_class_name} is not a valid AudioAdapter class')
    return adapter_class()
--- a/spleeter/audio/convertor.py
+++ b/spleeter/audio/convertor.py
@@ -3,39 +3,54 @@
 """ This module provides audio data convertion functions. """
 from ..utils.tensor import from_float32_to_uint8, from_uint8_to_float32
 # pyright: reportMissingImports=false
 # pylint: disable=import-error
 import numpy as np
 import tensorflow as tf
 # pylint: enable=import-error
 from ..utils.tensor import from_float32_to_uint8, from_uint8_to_float32
 __email__ = 'spleeter@deezer.com'
 __author__ = 'Deezer Research'
 __license__ = 'MIT License'
-def to_n_channels(waveform, n_channels):
+def to_n_channels(
-    """ Convert a waveform to n_channels by removing or
+        waveform: tf.Tensor,
-    duplicating channels if needed (in tensorflow).
+        n_channels: int) -> tf.Tensor:
    """
        Convert a waveform to n_channels by removing or duplicating channels if
        needed (in tensorflow).
-    :param waveform: Waveform to transform.
+        Parameters:
-    :param n_channels: Number of channel to reshape waveform in.
+            waveform (tensorflow.Tensor):
-    :returns: Reshaped waveform.
+                Waveform to transform.
            n_channels (int):
                Number of channel to reshape waveform in.
        Returns:
            tensorflow.Tensor:
                Reshaped waveform.
    """
    return tf.cond(
        tf.shape(waveform)[1] >= n_channels,
        true_fn=lambda: waveform[:, :n_channels],
-        false_fn=lambda: tf.tile(waveform, [1, n_channels])[:, :n_channels]
+        false_fn=lambda: tf.tile(waveform, [1, n_channels])[:, :n_channels])
    )
-def to_stereo(waveform):
+def to_stereo(waveform: np.ndarray) -> np.ndarray:
-    """ Convert a waveform to stereo by duplicating if mono,
+    """
-    or truncating if too many channels.
+        Convert a waveform to stereo by duplicating if mono, or truncating
        if too many channels.
-    :param waveform: a (N, d) numpy array.
+        Parameters:
-    :returns: A stereo waveform as a (N, 1) numpy array.
+            waveform (numpy.ndarray):
                a `(N, d)` numpy array.
        Returns:
            numpy.ndarray:
                A stereo waveform as a `(N, 2)` numpy array.
    """
    if waveform.shape[1] == 1:
        return np.repeat(waveform, 2, axis=-1)
@@ -44,45 +59,84 @@ def to_stereo(waveform):
    return waveform
-def gain_to_db(tensor, espilon=10e-10):
+def gain_to_db(tensor: tf.Tensor, espilon: float = 10e-10) -> tf.Tensor:
-    """ Convert from gain to decibel in tensorflow.
+    """
        Convert from gain to decibel in tensorflow.
-    :param tensor: Tensor to convert.
+        Parameters:
-    :param epsilon: Operation constant.
+            tensor (tensorflow.Tensor):
-    :returns: Converted tensor.
+                Tensor to convert
            espilon (float):
                Floor applied to the tensor before taking the logarithm,
                default to `10e-10` (the keyword is spelled `espilon`).
        Returns:
            tensorflow.Tensor:
                Converted tensor.
    """
    return 20. / np.log(10) * tf.math.log(tf.maximum(tensor, espilon))
-def db_to_gain(tensor):
+def db_to_gain(tensor: tf.Tensor) -> tf.Tensor:
-    """ Convert from decibel to gain in tensorflow.
+    """
        Convert from decibel to gain in tensorflow.
-    :param tensor_db: Tensor to convert.
+        Parameters:
-    :returns: Converted tensor.
+            tensor (tensorflow.Tensor):
                Tensor to convert
        Returns:
            tensorflow.Tensor:
                Converted tensor.
    """
    return tf.pow(10., (tensor / 20.))
-def spectrogram_to_db_uint(spectrogram, db_range=100., **kwargs):
+def spectrogram_to_db_uint(
-    """ Encodes given spectrogram into uint8 using decibel scale.
+        spectrogram: tf.Tensor,
-
+        db_range: float = 100.,
-    :param spectrogram: Spectrogram to be encoded as TF float tensor.
+        **kwargs) -> tf.Tensor:
    :param db_range: Range in decibel for encoding.
    :returns: Encoded decibel spectrogram as uint8 tensor.
    """
-    db_spectrogram = gain_to_db(spectrogram)
+        Encodes given spectrogram into uint8 using decibel scale.
-    max_db_spectrogram = tf.reduce_max(db_spectrogram)
+
-    db_spectrogram = tf.maximum(db_spectrogram, max_db_spectrogram - db_range)
+        Parameters:
            spectrogram (tensorflow.Tensor):
                Spectrogram to be encoded as TF float tensor.
            db_range (float):
                Range in decibel for encoding.
        Returns:
            tensorflow.Tensor:
                Encoded decibel spectrogram as `uint8` tensor.
    """
    db_spectrogram: tf.Tensor = gain_to_db(spectrogram)
    max_db_spectrogram: tf.Tensor = tf.reduce_max(db_spectrogram)
    db_spectrogram: tf.Tensor = tf.maximum(
        db_spectrogram,
        max_db_spectrogram - db_range)
    return from_float32_to_uint8(db_spectrogram, **kwargs)
-def db_uint_spectrogram_to_gain(db_uint_spectrogram, min_db, max_db):
+def db_uint_spectrogram_to_gain(
-    """ Decode spectrogram from uint8 decibel scale.
+        db_uint_spectrogram: tf.Tensor,
-
+        min_db: tf.Tensor,
-    :param db_uint_spectrogram: Decibel pectrogram to decode.
+        max_db: tf.Tensor) -> tf.Tensor:
    :param min_db: Lower bound limit for decoding.
    :param max_db: Upper bound limit for decoding.
    :returns: Decoded spectrogram as float2 tensor.
    """
-    db_spectrogram = from_uint8_to_float32(db_uint_spectrogram, min_db, max_db)
+        Decode spectrogram from uint8 decibel scale.
        Parameters:
            db_uint_spectrogram (tensorflow.Tensor):
                Decibel spectrogram to decode.
            min_db (tensorflow.Tensor):
                Lower bound limit for decoding.
            max_db (tensorflow.Tensor):
                Upper bound limit for decoding.
        Returns:
            tensorflow.Tensor:
                Decoded spectrogram as `float32` tensor.
    """
    db_spectrogram: tf.Tensor = from_uint8_to_float32(
        db_uint_spectrogram,
        min_db,
        max_db)
    return db_to_gain(db_spectrogram)
--- a/spleeter/audio/spectrogram.py
+++ b/spleeter/audio/spectrogram.py
@@ -3,6 +3,7 @@
 """ Spectrogram specific data augmentation """
 # pyright: reportMissingImports=false
 # pylint: disable=import-error
 import numpy as np
 import tensorflow as tf
--- a/spleeter/commands/init.py
+++ b/spleeter/commands/init.py
@@ -51,34 +51,38 @@ OPT_PARAMS = {
    'help': 'JSON filename that contains params'
 }
-# -s opt specification (separate).
+Offset: OptionInfo = Option(
-OPT_OFFSET = {
+    0.,
-    'dest': 'offset',
+    '--offset',
-    'type': float,
+    '-s',
-    'default': 0.,
+    help='Set the starting offset to separate audio from')
    'help': 'Set the starting offset to separate audio from.'
 }
-# -d opt specification (separate).
+Duration: OptionInfo = Option(
-OPT_DURATION = {
+    600.,
-    'dest': 'duration',
+    '--duration',
-    'type': float,
+    '-d',
-    'default': 600.,
+    help=(
    'help': (
        'Set a maximum duration for processing audio '
        '(only separate offset + duration first seconds of '
-        'the input file)')
+        'the input file)'))
 }
-# -w opt specification (separate)
+
-OPT_STFT_BACKEND = {
+class STFTBackendEnum(str, Enum):
-    'dest': 'stft_backend',
+    """ Enumeration of supported STFT computation backends. """
-    'type': str,
+    # The mix-in type must precede `Enum` in the bases, and every member
+    # needs a value: a bare annotation does not create an enum member.
+    AUTO = 'auto'
-    'choices' : ["tensorflow", "librosa", "auto"],
+    TENSORFLOW = 'tensorflow'
-    'default': "auto",
+    LIBROSA = 'librosa'
-    'help': 'Who should be in charge of computing the stfts. Librosa is faster than tensorflow on CPU and uses'
+
-            ' less memory. "auto" will use tensorflow when GPU acceleration is available and librosa when not.'
+
-}
+STFTBackend: OptionInfo = Option(
    STFTBackendEnum.AUTO,
    '--stft-backend',
    '-B',
    case_sensitive=False,
    help=(
        'Who should be in charge of computing the stfts. Librosa is faster '
        'than tensorflow on CPU and uses  less memory. "auto" will use '
        'tensorflow when GPU acceleration is available and librosa when not'))
 # -c opt specification (separate).
@@ -128,6 +132,14 @@ OPT_ADAPTER = {
    'help': 'Name of the audio adapter to use for audio I/O'
 }
 AudioAdapter: OptionInfo = Option(
    'spleeter.audio.ffmpeg.FFMPEGProcessAudioAdapter',
    '--adapter',
    help='Name of the audio adapter to use for audio I/O')
 # -a opt specification (train, evaluate and separate).
 OPT_VERBOSE = {
    'action': 'store_true',
--- a/spleeter/commands/separate.py
+++ b/spleeter/commands/separate.py
@@ -19,6 +19,10 @@ __author__ = 'Deezer Research'
 __license__ = 'MIT License'
 from typer import Option
 AudioAdapter = Option()
 def entrypoint(arguments, params):
    """ Command entrypoint.
--- a/spleeter/types.py
+++ b/spleeter/types.py
@@ -0,0 +1,15 @@
 #!/usr/bin/env python
 # coding: utf8
 """ TO DOCUMENT """
 from typing import Any, Tuple
 # pyright: reportMissingImports=false
 # pylint: disable=import-error
 import numpy as np
 # pylint: enable=import-error
 # Handle describing the audio to load. The AudioAdapter.load docstring
 # says that for a file based adapter it would be a file path; it is
 # aliased to `Any`, so no particular type is enforced.
 AudioDescriptor: type = Any
 # Loaded audio data, documented by AudioAdapter.load as a
 # `(wf, sample_rate)` tuple: a numpy array paired with a float.
 Signal: type = Tuple[np.ndarray, float]