Merge pull request #426 from deezer/fix_librosa_istft_edge

Fixing glitch issues with ISTFT
2026-03-30 12:27:03 +00:00 · 2020-07-24 17:05:58 +02:00
parent ca5cdd7d28 c6cc510069
commit 47b990e5f2
4 changed files with 60 additions and 58 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog History

+## 1.5.4
+
+First release, July 24th 2020
+
+Add some padding of the input waveform to avoid separation artefacts on the edges due to instabilities in the inverse Fourier transform.
+Also add tests to ensure both librosa and tensorflow backends have the same outputs.
+
 ## 1.5.2

 First released, May 15th 2020
--- a/spleeter/separator.py
+++ b/spleeter/separator.py
@@ -123,14 +123,18 @@ class Separator(object):
        data = np.asfortranarray(data)
        N = self._params["frame_length"]
        H = self._params["frame_step"]
+        
        win = hann(N, sym=False)
        fstft = istft if inverse else stft
-        win_len_arg = {"win_length": None, "length": length} if inverse else {"n_fft": N}
+        win_len_arg = {"win_length": None,
+                       "length": None} if inverse else {"n_fft": N}
        n_channels = data.shape[-1]
        out = []
        for c in range(n_channels):
-            d = data[:, :, c].T if inverse else data[:, c]
+            d = np.concatenate((np.zeros((N, )), data[:, c], np.zeros((N, )))) if not inverse else data[:, :, c].T
            s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
+            if inverse:
+                s = s[N:N+length]
            s = np.expand_dims(s.T, 2-inverse)
            out.append(s)
        if len(out) == 1:
--- a/tests/test_eval.py
+++ b/tests/test_eval.py
@@ -29,57 +29,29 @@ BACKENDS = ["tensorflow", "librosa"]
 TEST_CONFIGURATIONS = {el:el for el in BACKENDS}

 res_4stems = {
-                "librosa": {
-                    "vocals": {
-                        "SDR": -0.007,
-                        "SAR": -19.231,
-                        "SIR": -4.528,
-                        "ISR": 0.000
-                    },
-                    "drums": {
-                        "SDR": -0.071,
-                        "SAR": -14.496,
-                        "SIR": -4.987,
-                        "ISR": 0.001
-                    },
-                    "bass":{
-                        "SDR": -0.001,
-                        "SAR": -12.426,
-                        "SIR": -7.198,
-                        "ISR": -0.001
-                    },
-                    "other":{
-                        "SDR": -1.453,
-                        "SAR": -14.899,
-                        "SIR": -4.678,
-                        "ISR": -0.015
-                    }
+                "vocals": {
+                    "SDR": 3.25e-05,
+                    "SAR": -11.153575,
+                    "SIR": -1.3849,
+                    "ISR": 2.75e-05
                },
-                "tensorflow": {
-                    "vocals": {
-                        "SDR": 3.25e-05,
-                        "SAR": -11.153575,
-                        "SIR": -1.3849,
-                        "ISR": 2.75e-05
-                    },
-                    "drums": {
-                        "SDR": -0.079505,
-                        "SAR": -15.7073575,
-                        "SIR": -4.972755,
-                        "ISR": 0.0013575
-                    },
-                    "bass":{
-                        "SDR": 2.5e-06,
-                        "SAR": -10.3520575,
-                        "SIR": -4.272325,
-                        "ISR": 2.5e-06
-                    },
-                    "other":{
-                        "SDR": -1.359175,
-                        "SAR": -14.7076775,
-                        "SIR": -4.761505,
-                        "ISR": -0.01528
-                    }
+                "drums": {
+                    "SDR": -0.079505,
+                    "SAR": -15.7073575,
+                    "SIR": -4.972755,
+                    "ISR": 0.0013575
+                },
+                "bass":{
+                    "SDR": 2.5e-06,
+                    "SAR": -10.3520575,
+                    "SIR": -4.272325,
+                    "ISR": 2.5e-06
+                },
+                "other":{
+                    "SDR": -1.359175,
+                    "SAR": -14.7076775,
+                    "SIR": -4.761505,
+                    "ISR": -0.01528
                }
            }

@@ -102,15 +74,11 @@ def generate_fake_eval_dataset(path):
@pytest.mark.parametrize('backend', TEST_CONFIGURATIONS)
 def test_evaluate(backend):
    with TemporaryDirectory() as directory:
-
        generate_fake_eval_dataset(directory)
        p = create_argument_parser()
        arguments = p.parse_args(["evaluate", "-p", "spleeter:4stems", "--mus_dir", directory, "-B", backend])
        params = load_configuration(arguments.configuration)
        metrics = evaluate.entrypoint(arguments, params)
        for instrument, metric in metrics.items():
-            for metric, value in metric.items():
-                assert np.allclose(np.median(value), res_4stems[backend][instrument][metric], atol=1e-3)
-
-
-# test_evaluate("tensorflow")
+            for m, value in metric.items():
+                assert np.allclose(np.median(value), res_4stems[instrument][m], atol=1e-3)
--- a/tests/test_separator.py
+++ b/tests/test_separator.py
@@ -39,6 +39,29 @@ TEST_CONFIGURATIONS = list(itertools.product(TEST_AUDIO_DESCRIPTORS, MODELS, BAC
 print("RUNNING TESTS WITH TF VERSION {}".format(tf.__version__))


+@pytest.mark.parametrize('test_file', TEST_AUDIO_DESCRIPTORS)
+def test_separator_backends(test_file):
+    adapter = get_default_audio_adapter()
+    waveform, _ = adapter.load(test_file)
+
+    separator_lib = Separator("spleeter:2stems", stft_backend="librosa")
+    separator_tf = Separator("spleeter:2stems", stft_backend="tensorflow")
+
+    # Test that the STFT followed by the inverse STFT reconstructs the waveform (within atol=3e-2)
+    stft_matrix = separator_lib._stft(waveform)
+    reconstructed = separator_lib._stft(
+        stft_matrix, inverse=True, length=waveform.shape[0])
+    assert np.allclose(reconstructed, waveform, atol=3e-2)
+
+    # Compare both separations; they should be close
+    out_tf = separator_tf._separate_tensorflow(waveform, test_file)
+    out_lib = separator_lib._separate_librosa(waveform, test_file)
+
+    for instrument in out_lib.keys():
+        # test that both outputs are close everywhere
+        assert np.allclose(out_tf[instrument], out_lib[instrument], atol=1e-5)
+
+
@pytest.mark.parametrize('test_file, configuration, backend', TEST_CONFIGURATIONS)
 def test_separate(test_file, configuration, backend):
    """ Test separation from raw data. """