It seems that simple thresholding would work in your case. To ensure we don't clip prematurely, we require at least k
consecutive values to exceed the threshold before truncating the audio.
import librosa
import numpy as np
def first_occ_index(w, n):
    """Return the start index of the first run of at least ``n`` False values.

    Adapted from
    https://stackoverflow.com/questions/49693770/get-index-of-the-first-block-of-at-least-n-consecutive-false-values-in-boolean-a

    Args:
        w: One-dimensional boolean sequence or array.
        n: Minimum number of consecutive False values required.

    Returns:
        The index in ``w`` at which the first qualifying run begins. When
        ``w`` contains no run of at least ``n`` False values, ``len(w)`` is
        returned, so that ``x[:index]`` keeps everything and ``x[index:]``
        is empty.
    """
    w = np.asarray(w, dtype=bool)
    # Pad with True on both sides so runs touching either end are bounded by
    # a True marker; padded position p corresponds to original index p - 1.
    true_pos = np.flatnonzero(np.r_[True, w, True])
    # The gap between neighbouring True markers is the length of the False
    # run that lies between them.
    run_lengths = np.diff(true_pos) - 1
    long_enough = run_lengths >= n
    if not long_enough.any():
        # argmax of an all-False mask is 0, which would wrongly report a run
        # starting at index 0; signal "not found" with len(w) instead.
        return int(w.size)
    # The marker preceding the run sits at padded position p, so the run
    # starts at padded p + 1, i.e. at original index p.
    return int(true_pos[long_enough.argmax()])
# Load the recording; librosa.load returns the samples and the sampling rate.
X, fs = librosa.load('your.audio')

# Minimum run length: we require 20 consecutive values above the threshold.
k = 20

# Amplitude cut-off - 3 * std is a starting point (or e.g. 0.6 * X.max()); play with it.
threshold = 3 * X.std()

# True where the sample magnitude is below the threshold, False otherwise.
X_th = np.less(np.abs(X), threshold)

# Start of the first block of at least k consecutive False (i.e. loud) entries.
idx_to_cut = first_occ_index(X_th, k)

# Everything before the cut is kept; the remainder is discarded.
my_audio, garbage = X[:idx_to_cut], X[idx_to_cut:]