Your code have a complexity of O(N*N) [genesPerSet
=N]. But using the fact that the order of the values is irrelevant for you we can order it in O(N•log(N)) and compute the “scores” in O(N). (With potentially can be thousands time faster).
Also, we have a total of N*N comparisons. Then U_Original + U_Random
= N*N, meaning we don’t need to compute U_Random.
Also your statistic Zn= Umin-N*N/2;[Umix=min(U_Original,U_Random)],
when you only abs(Zn/Zd) is symmetric around N*N/2. We need only one algorithm.
1.- A first thing can be to take arguments by (const) reference :
int WilcoxinRST::work(const GeneSet &originalGeneSet, const vector<string> &randomGenes)
2.- You fill into vector geneIDs; but dont use it ? Why?
3.- You can iterate only 2 times.
4.- You keep the signal values (probe intensitat?) together in one vector and use another vector to signal what is each item – simple keep in two vector.
5.- You don’t need the score vector, only the summa.
6.- Why 20 millions times? I gess your are computing some “statistic” stability or BStrap. Probably you use the same originalGeneSet many time. I think you can in another question post the code that call this function to spare in making each time the vector of values and sorting.
Here is first the new O(N•log(N)) code.
What follows is a cleanup of your code but still O(N*N), fast but only by a constant factor.
Then the same code but mixed with yours original code and with more comments.
Please, debugg this and tell me how was.
#include<vector>
#include<algorithm>
int WilcoxinRST::work(const GeneSet &originalGeneSet , const vector<string>& randomGenes)
{
size_t genesPerSet = originalGeneSet.geneCount();
std::vector<double> valueOri(genesPerSet), valueRnd(genesPerSet);
/**
* Fill the valueOri vector with original gene set data, and valueRnd with random data
*/
for (size_t i = 0; i < genesPerSet; i++)
{
valueOri[i] = std::fabs(expressionLevels.getValue( originalGeneSet.getMemberGeneAt(i) , statType ));
valueRnd[i] = std::fabs(expressionLevels.getValue( randomGenes.at(i) , statType ));
}
std::sort(valueOri.begin(),valueOri.end());
std::sort(valueRnd.begin(),valueRnd.end());
/**
* calculate the scores Ua, Ub and U
*/
long U_Original ;
if (statType == Fold_Change || statType == T_Statistic || statType == Z_Statistic
statType == FDR_PValue || statType == PValue )
{
// Higher value is a winner
size_t j=0;
for (size_t i = 0; i < genesPerSet /*totalGenes*/; i++) // i - 2x
{
while(valueOri[i] > valueRnd[j]) ++j ;
U_Original += j;
}
} else { cout << endl << "ERROR. Statistic type not defined." << endl; }
/**
* calculate z
*/
double Zn, Zd, Z;
Zn = U_Original - ((genesPerSet * genesPerSet) / 2);
Zd = std::sqrt( (double) (((genesPerSet * genesPerSet* (genesPerSet + genesPerSet + 1)))) / 12.0);
Z = Zn / Zd;
/**
* Return 0/1/2
* 2: p value < 0.01
* 1: 0.01 < p value < 0.05
* 0: p value > 0.05
*/
if (std::fabs(Z) > 2.303) return 2;
else if (std::fabs(Z) > 1.605) return 1;
else return 0;
}
What follows is a cleanup of your code but still O(N*N), fast but only by a constant factor.
#include<vector>
using namespace std;
class GeneSet ;
class WilcoxinRST;
int WilcoxinRST::work(const GeneSet &originalGeneSet , const vector<string>& randomGenes)
{
size_t genesPerSet = originalGeneSet.geneCount();
vector<double> valueOri(genesPerSet), valueRnd(genesPerSet);
/**
* Fill the valueOri vector with original gene set data, and valueRnd with random data
*/
for (size_t i = 0; i < genesPerSet; i++)
{
valueOri[i] = fabs(expressionLevels.getValue( originalGeneSet.getMemberGeneAt(i) , statType ));
valueRnd[i] = fabs(expressionLevels.getValue( randomGenes.at(i) , statType ));
}
/**
* calculate the scores Ua, Ub and U
*/
long U_Original = 0, U_Random = 0, U_Final;
if (statType == Fold_Change || statType == T_Statistic || statType == Z_Statistic)
{
// Higher value is a winner
for (size_t i = 0; i < genesPerSet /*totalGenes*/; i++) // i - 2x
{ for (size_t j = 0; j < genesPerSet; j++)
{ U_Random += (valueRnd[i] > valueOri[j]);// i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are less than this Rnd
U_Original+= (valueOri[i] > valueRnd[j]);// i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are less than this Ori
}
}
} else
if (statType == FDR_PValue || statType == PValue)
{
// Lower value is a winner
for (size_t i = 0; i < genesPerSet; i++)
{
for (size_t j = 0; j < genesPerSet; j++)
{ U_Random += (valueRnd[i] < valueOri[j]);// i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are > than this Rnd
U_Original+= (valueOri[i] < valueRnd[j]);// i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are > than this Ori
}
}
} else { cout << endl << "ERROR. Statistic type not defined." << endl; }
U_Final = (U_Original < U_Random) ? U_Original : U_Random;
/**
* calculate z
*/
double Zn, Zd, Z;
Zn = U_Final - ((genesPerSet * genesPerSet) / 2);
Zd = sqrt(
(double) (((genesPerSet * genesPerSet
* (genesPerSet + genesPerSet + 1)))) / 12.0);
Z = Zn / Zd;
/**
* Return 0/1/2
* 2: p value < 0.01
* 1: 0.01 < p value < 0.05
* 0: p value > 0.05
*/
if (fabs(Z) > 2.303) return 2;
else if (fabs(Z) > 1.605) return 1;
else return 0;
}
the same code but mixed with yours original code and with more comments.
int WilcoxinRST::work(const GeneSet &originalGeneSet , const vector<string>& randomGenes)
{
size_t genesPerSet = originalGeneSet.geneCount();
unsigned int totalGenes, tempScore;
totalGenes = genesPerSet * 2;
//vector<string> geneIDs;
//vector<bool> isOriginal;
//vector<int> rank;
vector<double> valueOri(genesPerSet), valueRnd(genesPerSet);
//vector<int> score;
/**
* Fill the first half of the vectors with original gene set data
*/
for (size_t i = 0; i < genesPerSet; i++)
{
//geneIDs.push_back( originalGeneSet.getMemberGeneAt(i) );
//isOriginal.push_back(true);
valueOri[i] = fabs(expressionLevels.getValue( originalGeneSet.getMemberGeneAt(i) , statType ));
valueRnd[i] = fabs(expressionLevels.getValue( randomGenes.at(i) , statType ));
}
/**
* Fill the second half with random data
*/
//for (unsigned int i = genesPerSet; i < totalGenes; i++) {
// geneIDs.push_back(randomGenes.at(i - genesPerSet));
// isOriginal.push_back(false);
// value.push_back(fabs(expressionLevels.getValue(geneIDs[i], statType)));
//}
//totalGenes = geneIDs.size();
/**
* calculate the scores
*/
/**
* calculate Ua, Ub and U
*/
long U_Original = 0, U_Random = 0, U_Final;
//for (int j = 0; j < genesPerSet; j++) // j in 1 set=Ori. count how many Ori are less than this Rnd
//{
// U_Original += score[j];
//}
//for (unsigned int j = genesPerSet; j < totalGenes; j++) // j in 2 set=Rnd, count how many Rnd are less than this Ori
//{
// U_Random += score[j];
//}
if (statType == Fold_Change || statType == T_Statistic || statType == Z_Statistic)
{
// Higher value is a winner
for (size_t i = 0; i < genesPerSet /*totalGenes*/; i++) // i - 2x
{ //tempScore = 0;
//if (!isOriginal[i]) // i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are less than this Rnd
for (size_t j = 0; j < genesPerSet; j++)
{ U_Random += (valueRnd[i] > valueOri[j]);// i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are less than this Rnd
U_Original+= (valueOri[i] > valueRnd[j]);// i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are less than this Ori
}
//} else
//{
// for (unsigned int j = genesPerSet; j < totalGenes; j++) // i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are less than this Ori
// { if (value.at(i) > value.at(j)) { tempScore++; }
// }
//}
//score.push_back(tempScore);
}
} else
if (statType == FDR_PValue || statType == PValue)
{
// Lower value is a winner
for (size_t i = 0; i < genesPerSet; i++)
{
for (size_t j = 0; j < genesPerSet; j++)
{ U_Random += (valueRnd[i] < valueOri[j]);// i en 2 set=Rnd, j in 1 set=Ori. count how many Ori are > than this Rnd
U_Original+= (valueOri[i] < valueRnd[j]);// i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are > than this Ori
}
//} else
//{
// for (unsigned int j = genesPerSet; j < totalGenes; j++) // i in 1 set=Ori, j in 2 set=Rnd, count how many Rnd are less than this Ori
// { if (value.at(i) > value.at(j)) { tempScore++; }
// }
//}
//score.push_back(tempScore);
}
//for (unsigned int i = 0; i < totalGenes; i++)
//{ tempScore = 0;
// if (!isOriginal[i])
// { for (int j = 0; j < genesPerSet; j++) {
// if (value.at(i) < value.at(j)) { // Rnd i < Ori j increm U_Random
// tempScore++;
// }
// }
// } else {
// for (unsigned int j = genesPerSet; j < totalGenes; j++) { // Ori i < Rnd j. Increm U_Original
// if (value.at(i) < value.at(j)) {
// tempScore++;
// }
// }
// }
// score.push_back(tempScore);
//}
} else { cout << endl << "ERROR. Statistic type not defined." << endl; }
U_Final = (U_Original < U_Random) ? U_Original : U_Random;
/**
* calculate z
*/
double Zn, Zd, Z;
Zn = U_Final - ((genesPerSet * genesPerSet) / 2);
Zd = sqrt(
(double) (((genesPerSet * genesPerSet
* (genesPerSet + genesPerSet + 1)))) / 12.0);
Z = Zn / Zd;
/**
* Return 0/1/2
* 2: p value < 0.01
* 1: 0.01 < p value < 0.05
* 0: p value > 0.05
*/
if (fabs(Z) > 2.303)
return 2;
else if (fabs(Z) > 1.605)
return 1;
else
return 0;
}