Mirror of https://github.com/YuzuZensai/spleeter.git, synced 2026-01-31 14:58:23 +00:00
Merge pull request #426 from deezer/fix_librosa_istft_edge
Fixing glitch issues with istft
@@ -1,5 +1,12 @@
 # Changelog History
 
+## 1.5.4
+
+First release, July 24th 2020
+
+Add some padding of the input waveform to avoid separation artefacts at the edges due to instabilities in the inverse Fourier transform.
+Also add tests to ensure that both the librosa and tensorflow backends have the same outputs.
+
 ## 1.5.2
 
 First released, May 15th 2020
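The idea behind the fix can be illustrated in isolation. The snippet below is a minimal sketch, not part of the commit; the frame sizes and the test signal are made up. It pads the waveform with frame_length zeros on each side so that every original sample is fully covered by analysis windows, runs the forward and inverse STFT with center=False, then drops the padding again.

import numpy as np
from scipy.signal.windows import hann
from librosa import stft, istft

N, H = 4096, 1024                                # illustrative frame_length / frame_step
x = np.random.randn(44100).astype(np.float32)    # one second of fake mono audio
win = hann(N, sym=False)

# Pad with N zeros on each side so the first and last samples are fully windowed.
padded = np.concatenate((np.zeros(N), x, np.zeros(N)))
S = stft(padded, n_fft=N, hop_length=H, window=win, center=False)

# Invert without asking librosa to trim, then drop the padding manually.
y = istft(S, hop_length=H, window=win, center=False)[N:N + x.shape[0]]

assert np.allclose(y, x, atol=3e-2)              # same tolerance as the new backend test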
@@ -123,14 +123,18 @@ class Separator(object):
         data = np.asfortranarray(data)
         N = self._params["frame_length"]
         H = self._params["frame_step"]
+
         win = hann(N, sym=False)
         fstft = istft if inverse else stft
-        win_len_arg = {"win_length": None, "length": length} if inverse else {"n_fft": N}
+        win_len_arg = {"win_length": None,
+                       "length": None} if inverse else {"n_fft": N}
         n_channels = data.shape[-1]
         out = []
         for c in range(n_channels):
-            d = data[:, :, c].T if inverse else data[:, c]
+            d = np.concatenate((np.zeros((N, )), data[:, c], np.zeros((N, )))) if not inverse else data[:, :, c].T
             s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
+            if inverse:
+                s = s[N:N+length]
             s = np.expand_dims(s.T, 2-inverse)
             out.append(s)
         if len(out) == 1:
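The slice s[N:N+length] mirrors the padding added on the forward pass: because N zeros were prepended, the original samples of the reconstruction sit at indices N through N+length-1. A toy illustration of that indexing (illustrative sizes, not from the commit):

import numpy as np

N, length = 4, 10                   # toy frame_length and waveform length
x = np.arange(length, dtype=float)  # stands in for one channel of the waveform

# The forward pass pads with N zeros on each side, as in the diff above.
padded = np.concatenate((np.zeros(N), x, np.zeros(N)))

# After a perfect inverse STFT of the padded signal, trimming recovers x.
assert np.array_equal(padded[N:N + length], x)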
@@ -29,57 +29,29 @@ BACKENDS = ["tensorflow", "librosa"]
 TEST_CONFIGURATIONS = {el:el for el in BACKENDS}
 
 res_4stems = {
-    "librosa": {
-        "vocals": {
-            "SDR": -0.007,
-            "SAR": -19.231,
-            "SIR": -4.528,
-            "ISR": 0.000
-        },
-        "drums": {
-            "SDR": -0.071,
-            "SAR": -14.496,
-            "SIR": -4.987,
-            "ISR": 0.001
-        },
-        "bass":{
-            "SDR": -0.001,
-            "SAR": -12.426,
-            "SIR": -7.198,
-            "ISR": -0.001
-        },
-        "other":{
-            "SDR": -1.453,
-            "SAR": -14.899,
-            "SIR": -4.678,
-            "ISR": -0.015
-        }
-    },
-    "tensorflow": {
-        "vocals": {
-            "SDR": 3.25e-05,
-            "SAR": -11.153575,
-            "SIR": -1.3849,
-            "ISR": 2.75e-05
-        },
-        "drums": {
-            "SDR": -0.079505,
-            "SAR": -15.7073575,
-            "SIR": -4.972755,
-            "ISR": 0.0013575
-        },
-        "bass":{
-            "SDR": 2.5e-06,
-            "SAR": -10.3520575,
-            "SIR": -4.272325,
-            "ISR": 2.5e-06
-        },
-        "other":{
-            "SDR": -1.359175,
-            "SAR": -14.7076775,
-            "SIR": -4.761505,
-            "ISR": -0.01528
-        }
-    }
+    "vocals": {
+        "SDR": 3.25e-05,
+        "SAR": -11.153575,
+        "SIR": -1.3849,
+        "ISR": 2.75e-05
+    },
+    "drums": {
+        "SDR": -0.079505,
+        "SAR": -15.7073575,
+        "SIR": -4.972755,
+        "ISR": 0.0013575
+    },
+    "bass":{
+        "SDR": 2.5e-06,
+        "SAR": -10.3520575,
+        "SIR": -4.272325,
+        "ISR": 2.5e-06
+    },
+    "other":{
+        "SDR": -1.359175,
+        "SAR": -14.7076775,
+        "SIR": -4.761505,
+        "ISR": -0.01528
+    }
 }
 
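The reference values above are median BSS-eval scores per instrument (SDR: signal-to-distortion, SIR: source-to-interference, SAR: source-to-artifacts, ISR: image-to-spatial-distortion). Since both backends now produce the same separation, a single table replaces the old per-backend ones. As a hedged sketch of where such numbers typically come from, the snippet below computes the four metrics for one synthetic track with museval; the diff does not show the evaluation internals, so treat the call as an assumption about the pipeline.

import numpy as np
import museval

rate = 44100
# Fake references and estimates with shape (n_sources, n_samples, n_channels).
references = np.random.randn(2, rate, 2)
estimates = references + 0.1 * np.random.randn(2, rate, 2)

# museval returns framewise SDR, ISR, SIR and SAR arrays of shape (n_sources, n_frames).
sdr, isr, sir, sar = museval.evaluate(references, estimates)
print(np.median(sdr, axis=1))   # per-source median, as asserted in test_evaluate below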
@@ -102,15 +74,11 @@ def generate_fake_eval_dataset(path):
 @pytest.mark.parametrize('backend', TEST_CONFIGURATIONS)
 def test_evaluate(backend):
     with TemporaryDirectory() as directory:
-
         generate_fake_eval_dataset(directory)
         p = create_argument_parser()
         arguments = p.parse_args(["evaluate", "-p", "spleeter:4stems", "--mus_dir", directory, "-B", backend])
         params = load_configuration(arguments.configuration)
         metrics = evaluate.entrypoint(arguments, params)
         for instrument, metric in metrics.items():
-            for metric, value in metric.items():
-                assert np.allclose(np.median(value), res_4stems[backend][instrument][metric], atol=1e-3)
-
-
-# test_evaluate("tensorflow")
+            for m, value in metric.items():
+                assert np.allclose(np.median(value), res_4stems[instrument][m], atol=1e-3)
@@ -39,6 +39,29 @@ TEST_CONFIGURATIONS = list(itertools.product(TEST_AUDIO_DESCRIPTORS, MODELS, BACKENDS))
 print("RUNNING TESTS WITH TF VERSION {}".format(tf.__version__))
 
 
+@pytest.mark.parametrize('test_file', TEST_AUDIO_DESCRIPTORS)
+def test_separator_backends(test_file):
+    adapter = get_default_audio_adapter()
+    waveform, _ = adapter.load(test_file)
+
+    separator_lib = Separator("spleeter:2stems", stft_backend="librosa")
+    separator_tf = Separator("spleeter:2stems", stft_backend="tensorflow")
+
+    # Test that the stft and inverse stft provide exact reconstruction
+    stft_matrix = separator_lib._stft(waveform)
+    reconstructed = separator_lib._stft(
+        stft_matrix, inverse=True, length=waveform.shape[0])
+    assert np.allclose(reconstructed, waveform, atol=3e-2)
+
+    # Compare both separations, they should be close
+    out_tf = separator_tf._separate_tensorflow(waveform, test_file)
+    out_lib = separator_lib._separate_librosa(waveform, test_file)
+
+    for instrument in out_lib.keys():
+        # Test that both outputs are close everywhere
+        assert np.allclose(out_tf[instrument], out_lib[instrument], atol=1e-5)
+
+
 @pytest.mark.parametrize('test_file, configuration, backend', TEST_CONFIGURATIONS)
 def test_separate(test_file, configuration, backend):
     """ Test separation from raw data. """
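The new test above drives the private _separate_tensorflow and _separate_librosa methods directly. For everyday use, the same comparison can go through the public API; the sketch below assumes that Separator.separate dispatches to whichever backend the instance was built with, which is a reading of the test rather than something the diff states, and the silent input waveform is a placeholder.

import numpy as np
from spleeter.separator import Separator

# Hypothetical usage sketch: compare the two STFT backends via the public API.
waveform = np.zeros((44100, 2), dtype=np.float32)   # stand-in for a loaded stereo file

out_lib = Separator("spleeter:2stems", stft_backend="librosa").separate(waveform)
out_tf = Separator("spleeter:2stems", stft_backend="tensorflow").separate(waveform)

for instrument in out_lib:
    # With the padding fix, both backends should agree to within a tight tolerance.
    assert np.allclose(out_tf[instrument], out_lib[instrument], atol=1e-5)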