From 0016e3519274ec2dc766c2eac40539d9a44c0185 Mon Sep 17 00:00:00 2001
From: mmoussallam <manuel.moussallam@deezer.com>
Date: Fri, 24 Jul 2020 16:32:32 +0200
Subject: [PATCH] Align the padding in the librosa backend with what is now
 done in the TensorFlow backend

---
 spleeter/separator.py   |  8 ++---
 tests/test_eval.py      | 74 +++++++++++++----------------------------
 tests/test_separator.py | 18 +---------
 3 files changed, 27 insertions(+), 73 deletions(-)

diff --git a/spleeter/separator.py b/spleeter/separator.py
index 9318911..2c636b4 100644
--- a/spleeter/separator.py
+++ b/spleeter/separator.py
@@ -123,7 +123,7 @@ class Separator(object):
         data = np.asfortranarray(data)
         N = self._params["frame_length"]
         H = self._params["frame_step"]
-        F = int(N/2) + 1
+        
         win = hann(N, sym=False)
         fstft = istft if inverse else stft
         win_len_arg = {"win_length": None,
@@ -131,12 +131,10 @@ class Separator(object):
         n_channels = data.shape[-1]
         out = []
         for c in range(n_channels):
-            d = np.concatenate((np.zeros((F, 1)), data[:, :, c].T, np.zeros(
-                (F, 1))), axis=1) if inverse else data[:, c]
+            d = np.concatenate((np.zeros((N, )), data[:, c], np.zeros((N, )))) if not inverse else data[:, :, c].T
             s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
             if inverse:
-                s = s[H:]
-                s = s[:length]
+                s = s[N:N+length]
             s = np.expand_dims(s.T, 2-inverse)
             out.append(s)
         if len(out) == 1:
diff --git a/tests/test_eval.py b/tests/test_eval.py
index 547d1e5..97540a9 100644
--- a/tests/test_eval.py
+++ b/tests/test_eval.py
@@ -29,57 +29,29 @@ BACKENDS = ["tensorflow", "librosa"]
 TEST_CONFIGURATIONS = {el:el for el in BACKENDS}
 
 res_4stems = {
-                "librosa": {
-                     "vocals": {
-                        "SDR": 0.000,
-                        "SAR": -16.212,
-                        "SIR": -4.172,
-                        "ISR": 0.000
-                    },
-                    "drums": {
-                        "SDR": -0.077,
-                        "SAR": -15.739,
-                        "SIR": -5.045,
-                        "ISR": 0.001
-                    },
-                    "bass":{
-                        "SDR": -0.000,
-                        "SAR": -10.665,
-                        "SIR": -5.646,
-                        "ISR": -0.000
-                    },
-                    "other":{
-                        "SDR": -1.309,
-                        "SAR": -14.573,
-                        "SIR": -4.705,
-                        "ISR": -0.014
-                    }
+                "vocals": {
+                    "SDR": 3.25e-05,
+                    "SAR": -11.153575,
+                    "SIR": -1.3849,
+                    "ISR": 2.75e-05
                 },
-                "tensorflow": {
-                    "vocals": {
-                        "SDR": 3.25e-05,
-                        "SAR": -11.153575,
-                        "SIR": -1.3849,
-                        "ISR": 2.75e-05
-                    },
-                    "drums": {
-                        "SDR": -0.079505,
-                        "SAR": -15.7073575,
-                        "SIR": -4.972755,
-                        "ISR": 0.0013575
-                    },
-                    "bass":{
-                        "SDR": 2.5e-06,
-                        "SAR": -10.3520575,
-                        "SIR": -4.272325,
-                        "ISR": 2.5e-06
-                    },
-                    "other":{
-                        "SDR": -1.359175,
-                        "SAR": -14.7076775,
-                        "SIR": -4.761505,
-                        "ISR": -0.01528
-                    }
+                "drums": {
+                    "SDR": -0.079505,
+                    "SAR": -15.7073575,
+                    "SIR": -4.972755,
+                    "ISR": 0.0013575
+                },
+                "bass":{
+                    "SDR": 2.5e-06,
+                    "SAR": -10.3520575,
+                    "SIR": -4.272325,
+                    "ISR": 2.5e-06
+                },
+                "other":{
+                    "SDR": -1.359175,
+                    "SAR": -14.7076775,
+                    "SIR": -4.761505,
+                    "ISR": -0.01528
                 }
             }
 
@@ -109,4 +81,4 @@ def test_evaluate(backend):
         metrics = evaluate.entrypoint(arguments, params)
         for instrument, metric in metrics.items():
             for m, value in metric.items():
-                assert np.allclose(np.median(value), res_4stems[backend][instrument][m], atol=1e-3)
+                assert np.allclose(np.median(value), res_4stems[instrument][m], atol=1e-3)
diff --git a/tests/test_separator.py b/tests/test_separator.py
index d850e97..3094900 100644
--- a/tests/test_separator.py
+++ b/tests/test_separator.py
@@ -53,29 +53,13 @@ def test_separator_backends(test_file):
         stft_matrix, inverse=True, length=waveform.shape[0])
     assert np.allclose(reconstructed, waveform, atol=3e-2)
 
-    # # now also test that tensorflow and librosa STFT provide same results
-    from spleeter.audio.spectrogram import compute_spectrogram_tf
-    tf_waveform = tf.convert_to_tensor(waveform, tf.float32)
-    spectrogram_tf = compute_spectrogram_tf(tf_waveform,
-                                            separator_tf._params['frame_length'],
-                                            separator_tf._params['frame_step'],)
-    with tf.Session() as sess:
-        spectrogram_tf_eval = spectrogram_tf.eval()
-
-    # check that stfts are equivalent
-    assert stft_matrix.shape == spectrogram_tf_eval.shape
-    assert np.allclose(
-        np.abs(stft_matrix), spectrogram_tf_eval, atol=1e-2)
-
     # compare both separation, it should be close
     out_tf = separator_tf._separate_tensorflow(waveform, test_file)
     out_lib = separator_lib._separate_librosa(waveform, test_file)
 
     for instrument in out_lib.keys():
         # test that both outputs are close everywhere
-        assert np.allclose(out_tf[instrument], out_lib[instrument], atol=0.025)
-        # it should be even more similar outside edges zones
-        assert np.allclose(out_tf[instrument][4096:-4096,:], out_lib[instrument][4096:-4096,:], atol=0.002)
+        assert np.allclose(out_tf[instrument], out_lib[instrument], atol=1e-5)
 
 
 @pytest.mark.parametrize('test_file, configuration, backend', TEST_CONFIGURATIONS)