I have a container (array or vector) and millions of words. I need to sort them in following order s.
The primary sort order should be the number of characters in the word. The secondary sort order should be lexicographical. I can not use any library such as sort. I want to create the algorithms from scratch. I appreciate if anyone can hit me up with any reference.
So sorting the words:
This is a list of unsorted words
should give:
a is of This list words unsorted
Edit:
I am not allowed to use any STL
such as sort
//Following is my final program
//It wi be run with following: args: <inputfile> <outputfile> <timesfile> <ntests>
//timesfile is for storing times and ntests is for number of test
/*
Bernard Grey
10 Wednesday 10 Sep 2014
*/
#include <iostream>
#include <ctime>
#include <algorithm>
#include <fstream>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <vector>
using namespace std;
//This node contain two type of information both in the vector
//First is vector for hash function. it contains number of repetition of the word
//Second node contain a word for values in my vector and the other field is for future implementation ;)
struct node
{
string val;
int count;
};
//Definition of inner and outer vectors as cintainer of words and hash table
typedef std::vector<node> StringVector;
typedef std::vector<StringVector> StringVector2D;
//Cited at http://stackoverflow.com/questions/8317508/hash-function-for-a-string :In the comment
int HashTable (string word)
{
int seed = 378551;
unsigned long hash = 0;
for(int i = 0; i < word.length(); i++)
{
hash = (hash * seed) + word[i];
}
return hash % 1000000;//Later assign it to number of words
}
//Cite at: http://stackoverflow.com/questions/25726530/how-to-find-an-struct-element-in-a-two-dimention-vector
struct find_word
{
string val;
find_word(string val) : val(val) {}
bool operator () ( const node& m ) const
{
return m.val == val;
}
};
//I could use swap function in vector instead of implementing this function
void swap(StringVector& vec, int i, int j)
{
node tmp = vec[i];
vec[i] = vec[j];
vec[j] = tmp;
}
//To compare string alphabetically order
bool comp(node& i,node& p)
{
int cmp;
if(i.val.compare(p.val)<0)
{
return true;
}
return false;
}
void quickSort(StringVector& aVec, int left, int right);
int partition(StringVector& aVec, int left, int right);
void swap(StringVector& aVec, int left, int right);
void quickSort(StringVector& aVec, int left, int right)
{
if(right>0){
int index = partition(aVec,left,right);
if (left<index-1) {
quickSort(aVec, left, index-1);
}
if (index<right) {
quickSort(aVec, index,right);
}
}
}
int partition(StringVector& aVec, int left, int right)
{
string pivotNode;
pivotNode = aVec[(left+right)/2].val;
while (left<=right) {
while (aVec[left].val.compare(pivotNode)<0) {left++; }
while (aVec[right].val.compare(pivotNode)>0) {right--; }
if (left<=right) {
swap(aVec,left,right);
left++;
right--;
}
}
return left;
}
//Welcome to Maaaain
int main(int argc, char* argv[])
{
/*file reading and preprocessing*/
if(argc != 5)
{
cerr << "usage: " << argv[0] << " infile outfile timesfile ntests" << endl;
}
ifstream fin(argv[1]);
if(fin.fail())
{
cerr << "Error: failed to open file " << argv[1] << " for input" << endl;
exit(EXIT_FAILURE);
}
int ntests = atoi(argv[4]);
//Len of string and max num word
int stringlen, numwords;
get_max_words(fin, stringlen, numwords);
//initial string
string init[numwords];
//Read the file and add it to first array
for(int i=0; i<numwords; i++)
{
string tmp;
fin >> tmp;
int len = tmp.length();
//There is one single ' in the example output file. so I do not want to delete that one :-)
bool pp = true;
//Remove punct from leading and tail
if(len==1)
{
pp=false;
}
//Remove punc
if( ispunct(tmp[0]) && pp)
{
tmp.erase(0,1);
}
//Remove punc
if( ispunct(tmp[len-1]) && pp)
{
tmp.erase(len-1,1);
}
init[i] =tmp;
}
/*
At this point, everything should be in the initial array
The temporary array should be declared but not filled
*/
clockid_t cpu;
timespec start, end;
long time[ntests];
//2 Dimension vector this will called outer vector
StringVector2D twoD;
if(clock_getcpuclockid(0, &cpu) != 0)
{
cerr << "Error: could not get cpu clock" << endl;
exit(EXIT_FAILURE);
}
int rep = 0;
node tmp;
tmp.count = 0;
tmp.val = "";
//Later I need to assign it to number of words * M ... Good for encryption... It is not a security subject
vector<node> first(1000000,tmp);
//This is called inner vector
vector<string> templateVec;
//Last search?
bool last = false;
//Initialize inner map as needed and put it inside the outer vector with no data
for(int f=0;f<(stringlen);f++)
{
StringVector myVec;
twoD.push_back(myVec);
}
for(int i=0; i<ntests; i++)
{
if(clock_gettime(cpu, &start) == -1)
{
cerr << "Error: could not get start time" << endl;
exit(EXIT_FAILURE);
}
//Check if it is last iteration so do not delete data for printing purposeses
if(i == ntests-1)
{
last = true;
}
/*copy from initial array to temporary array*/
//Initialize inner vector with the values. In this point outer vector is filled with inner vector
//&&& inner vector is empty myvec.empty() = true;
//vector at index 0 is for words with one char... vector 1 is for words with two chars and so on...
for(int j=0; j<numwords; j++)
{
int len = init[j].length()-1;
if(len<0)continue;
//Initilize a node to fill up the vector
node currNode;
currNode.val = init[j];
//currNode.count = 0;
int hash = HashTable(init[j]);
//Node already existed
if(first[hash].count != 0){
//Add to its value in hash table
first[hash].count++;
}
else
{
//Activate word first time!
first[hash].count =1;
//I can even not use this because of the hash table but it may help in future improvment!!!
first[hash].val = init[j];
//Add the word to appropriate level in outer string! 1char == [0] --- 2char== [1] so on
twoD[len].push_back(currNode);
}
}
//Sort Alphabetically order
for(int f=0;f<(stringlen);f++)
{
//Eficcient sorting algorithm with no chance of segmentation dump ;)
quickSort(twoD[f],0,twoD[f].size()-1);
}
//Time finished
if(clock_gettime(cpu, &end) == -1)
{
cerr << "Error: could not get end time" << endl;
exit(EXIT_FAILURE);
}
//Delete items from vector if it is not last iteration --- This is not part of sorting algorithm so it is after clock
if(!last)
{
for(int f=0;f<stringlen;f++)
{
twoD[f].clear();
}
twoD.clear();
for(StringVector::iterator it3 = first.begin();it3!=first.end();it3++)
{
it3->val="";
it3->count=0;
}
//Initialize inner map as needed and put it inside the outer vector
for(int f=0;f<(stringlen);f++)
{
StringVector myVec;
twoD.push_back(myVec);
}
}
/*time per trial in nanoseconds*/
time[i] = (end.tv_sec - start.tv_sec)*1000000000 + end.tv_nsec - start.tv_nsec;
}
/*output sorted temporary array*/
int k=0;
int y =0;
int num=0;
ofstream fout(argv[2]);
//Pointer for inner vector
StringVector::iterator it2;
for (StringVector2D::iterator outer = twoD.begin(); outer != twoD.end(); ++outer){
y++;
k=0;
for (it2= outer->begin(); it2!=outer->end(); ++it2){
//Get back data from hash table
int hash = HashTable(it2->val);
//Number of word in other field of the node
int repWord = first[hash].count;
//Print according to that
for(int g=0; g < repWord ;g++){
num++;
//10 char in one line
if(num%10 == 0)
{
fout << it2->val;
fout<<endl;
k++;
}
else
{
fout<< it2->val << " ";
}
}
}
}
//Sort times with STL for god sake....
sort(time,time+ntests);
//print times to the file///
ofstream ftimes(argv[3]);
for(int i=0; i<ntests; i++)
ftimes << time[i] << endl;
}
//Helper function .. nice job
void get_max_words(ifstream& fin, int& wordlen, int& numwords)
{
char c;
int count=0;
wordlen = numwords = 0;
while(fin.good() && fin.get(c) && isspace(c)){;} //skip leading space
while(fin.good())
{
++numwords;
while(fin.good() && !isspace(c))
{
++count;
fin.get(c);
}
if(count > wordlen)
wordlen = count;
count = 0;
while(fin.good() && fin.get(c) && isspace(c)){;} //skip space
}
if(count > wordlen)
wordlen = count;
fin.clear();
fin.seekg(0, ios::beg);
}