So I am making an AI project that classifies speech into either "up", "down", "left", right or background noise, and from this, a character in a videogame is moved.
I have made an FFT algorithm deriving it from the mathematical explanation, which I believe is correct as I have tested its output against that from this site (https://engineering.icalculator.info/discrete-fourier-transform-calculator.html)
I then have tried to generate a spectrogram and have used code based on the code from the main function of the App class from this site (Creating spectrogram from .wav using FFT in java)
I tested my code on a .wav file of me saying hello and the spectrogram generated is not what I was expecting out, see below the difference between my java made spectrogram and my python made spectrogram (ignore the colour difference).
New Java Spectrogram with SleuthEyes help
Here is the original code I have used/written:
package STACKOVERFLOW;
import com.company.Complex;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Scanner;
public class StackOverFlow {
private static Color getColour(double power) {
var H = power * 0.4;
var S = 1.0;
var B = 1.0;
return Color.getHSBColor((float) H, (float) S, (float) B);
}
private static double[] getAudioData(String filePath) {
var path = Paths.get(filePath);
try {
var entireFileData = Files.readAllBytes(path);
var rawData = Arrays.copyOfRange(entireFileData, 44, entireFileData.length);
var length = rawData.length;
var newLength = length / 4;
var dataMono = new double[newLength];
double left, right;
for (int i = 0; 2 * i + 3< newLength; i++) {
left = (short) ((rawData[2 * i + 1] & 0xff) << 8) | (rawData[2 * i] & 0xff);
right = (short) ((rawData[2 * i + 3] & 0xff) << 8) | (rawData[2 * i + 2] & 0xff);
dataMono[i] = (left + right) / 2.0;
}
return dataMono;
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
private static Complex[] toComplex(double[] samples) {
var l = samples.length;
var cOut = new Complex[l];
for (int i = 0; i < l; i++) {
cOut[i] = new Complex(samples[i], 0);
}
return cOut;
}
private static double modulusSquared(Complex a) {
var real = a.getReal();
var imaginary = a.getImag();
return (real * real) + (imaginary * imaginary);
}
private static Complex[] fft(Complex[] samples) {
var N = samples.length; // number of samples
if (N == 1) return samples; // stops the recursive splits on the samples
// TODO: M only works for N a power of 2
var M = N / 2; // middle index of the samples
var Xeven = new Complex[M]; // array for even split
var Xodd = new Complex[M]; // array for odd split
// splits the samples
for (int i = 0; i < M; i++) {
Xeven[i] = samples[2 * i];
Xodd[i] = samples[2 * i + 1];
}
// recursive calls on even and odd samples
var Feven = new Complex[M];
Feven = fft(Xeven);
var Fodd = new Complex[M];
Fodd = fft(Xodd);
var frequencyBins = new Complex[N];
for (int i = 0; i < (N / 2); i++) {
var cExponential = Complex.multiply(
Complex.polar(1, -2 * Math.PI * i / N),
Fodd[i]
);
frequencyBins[i] = Complex.add(
Feven[i],
cExponential
);
frequencyBins[i + N / 2] = Complex.sub(
Feven[i],
cExponential
);
}
return frequencyBins;
}
public static void makeSpectrogram() {
var scan = new Scanner(System.in);
System.out.println("Enter file path: ");
var filePath = scan.nextLine();
var rawAudioData = getAudioData(filePath);
assert rawAudioData != null;
var length = rawAudioData.length;
var complexAudioData = toComplex(rawAudioData);
// parameters for FFT
var windowSize = 256;
var overlapFactor = 2;
var windowStep = windowSize / overlapFactor;
// plotData array
var nX = (length - windowSize) / windowStep;
var nY = (windowSize / 2);
var plotData = new double[nX][nY];
// amplitudes to normalise
var maxAmplitude = Double.MIN_VALUE;
var minAmplitude = Double.MAX_VALUE;
double amplitudeSquared;
// application of the FFT
for (int i = 0; i < nX; i++) {
var windowSizeArray = fft(Arrays.copyOfRange(complexAudioData, i * windowStep, i * windowStep + windowSize));
for (int j = 0; j < nY; j++) {
amplitudeSquared = modulusSquared(windowSizeArray[2 * j]);
if (amplitudeSquared == 0.0) {
plotData[i][nY - j - 1] = amplitudeSquared;
} else {
var threshold = 1.0; // prevents log(0)
plotData[i][nY - j - 1] = 10 * Math.log10(Math.max(amplitudeSquared, threshold));
}
// find min and max amplitudes
if (plotData[i][j] > maxAmplitude) {
maxAmplitude = plotData[i][j];
} else if (plotData[i][j] < minAmplitude) {
minAmplitude = plotData[i][j];
}
}
}
// normalisation
var difference = maxAmplitude - minAmplitude;
for (int i = 0; i < nX; i++) {
for (int j = 0; j < nY; j++) {
plotData[i][j] = (plotData[i][j] - minAmplitude) / difference;
}
}
// plot the spectrogram
var spectrogram = new BufferedImage(nX, nY, BufferedImage.TYPE_INT_RGB);
double ratio;
for (int i = 0; i < nX; i++) {
for (int j = 0; j < nY; j++) {
ratio = plotData[i][j];
var colour = getColour(1.0 - ratio);
spectrogram.setRGB(i, j, colour.getRGB());
}
}
// write the image to a file
try {
var outputFile = new File("saved.png");
ImageIO.write(spectrogram, "png", outputFile);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
makeSpectrogram();
}
}
Here is the Complex class that is used above:
package com.company;
import java.text.DecimalFormat;
public class Complex {
private final static DecimalFormat df2 = new DecimalFormat("#.##");
private double r;
private double i;
public Complex(double r, double i) {
this.r = r;
this.i = i;
}
@Override
public String toString() {
return "(" + df2.format(this.r) + ", " + df2.format(this.i) + "i) ";
}
public double abs() {
return Math.hypot(this.r, this.i);
}
public double getReal() {
return this.r;
}
public double getImag() {
return this.i;
}
public void setReal(double r) {
this.r = r;
}
public void setImag(double i) {
this.i = i;
}
public static Complex polar(double r, double theta) {
return new Complex(
r * Math.cos(theta),
r * Math.sin(theta)
);
}
public static Complex multiply(Complex a, Complex b) {
/*
(a + bi) * (c + di) =
ac + adi + cbi + -bd =
(ac - bd) + (ad + cb)i
*/
var real = (a.r * b.r) - (a.i * b.i);
var imag = (a.r * b.i) + (a.i * b.r);
return new Complex(real, imag);
}
public static Complex add(Complex a, Complex b) {
return new Complex(
a.r + b.r,
a.i + b.i
);
}
public static Complex sub(Complex a, Complex b) {
return new Complex(
a.r - b.r,
a.i - b.i
);
}
}
any guidance would be appreciated