0

I have a container (array or vector) and millions of words. I need to sort them in following order s.

The primary sort order should be the number of characters in the word. The secondary sort order should be lexicographical. I can not use any library such as sort. I want to create the algorithms from scratch. I appreciate if anyone can hit me up with any reference.

So sorting the words:

This is a list of unsorted words

should give:

a is of This list words unsorted

Edit:

I am not allowed to use any STL such as sort

//Following is my final program
//It wi be run with following:  args: <inputfile> <outputfile> <timesfile> <ntests>  
//timesfile is for storing times and ntests is for number of test
/*
Bernard Grey
10 Wednesday 10 Sep 2014
*/
#include <iostream>
#include <ctime>
#include <algorithm>
#include <fstream>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <vector>

using namespace std;

//This node contain two type of information both in the vector
//First is vector for hash function. it contains number of repetition of the word
//Second node contain a word for values in my vector and the other field is for future implementation ;)
struct node
{
    string val;
    int count;
};

//Definition of inner and outer vectors as cintainer of words and hash table
typedef std::vector<node> StringVector;
typedef std::vector<StringVector> StringVector2D;




//Cited at http://stackoverflow.com/questions/8317508/hash-function-for-a-string :In the comment
int HashTable (string word)
{
   int seed = 378551; 
   unsigned long hash = 0;
   for(int i = 0; i < word.length(); i++)
   {
      hash = (hash * seed) + word[i];
   }
   return hash % 1000000;//Later assign it to number of words
}

//Cite at: http://stackoverflow.com/questions/25726530/how-to-find-an-struct-element-in-a-two-dimention-vector
struct find_word
{
    string val;
    find_word(string val) : val(val) {}
    bool operator () ( const node& m ) const
    {
        return m.val == val;
    }
};


//I could use swap function in vector instead of implementing this function
void swap(StringVector& vec, int i, int j)
{
    node tmp = vec[i];
    vec[i] = vec[j];
    vec[j] = tmp;

}

//To compare string alphabetically order
bool comp(node& i,node& p)
{
    int cmp;
    if(i.val.compare(p.val)<0)
    {
        return true;
    }
    return false;
}

void quickSort(StringVector& aVec, int left, int right);
int partition(StringVector& aVec, int left, int right);
void swap(StringVector& aVec, int left, int right);


void quickSort(StringVector& aVec, int left, int right)
{
    if(right>0){
        int index = partition(aVec,left,right);

        if (left<index-1) {

            quickSort(aVec, left, index-1);

        }
        if (index<right) {
            quickSort(aVec, index,right);
        }
    }    
}
int partition(StringVector& aVec, int left, int right)
{

    string pivotNode;

         pivotNode = aVec[(left+right)/2].val;

    while (left<=right) { 

        while (aVec[left].val.compare(pivotNode)<0) {left++;  }  
        while (aVec[right].val.compare(pivotNode)>0) {right--;  }

        if (left<=right) {

           swap(aVec,left,right);
           left++;
           right--;
        }
    }
    return left;
}

//Welcome to Maaaain
int main(int argc, char* argv[])
{

    /*file reading and preprocessing*/
    if(argc != 5)
    {
        cerr << "usage: " << argv[0]  << " infile outfile timesfile ntests" << endl;
    }

    ifstream fin(argv[1]);
    if(fin.fail())
    {
        cerr << "Error: failed to open file " << argv[1]  << " for input" << endl;
        exit(EXIT_FAILURE);
    }
    int ntests = atoi(argv[4]);
    //Len of string and max num word
    int stringlen, numwords;
    get_max_words(fin, stringlen, numwords);
    //initial string
    string init[numwords];


    //Read the file and add it to first array
    for(int i=0; i<numwords; i++)
    {

        string tmp;
        fin >> tmp;
        int len = tmp.length();
        //There is one single ' in the example output file. so I do not want to delete that one :-)
        bool pp = true;
        //Remove punct from leading and tail
        if(len==1)
        {
            pp=false;
        }
        //Remove punc
        if( ispunct(tmp[0]) && pp)
        {
            tmp.erase(0,1);
        }
        //Remove punc
        if( ispunct(tmp[len-1]) && pp)
        {
            tmp.erase(len-1,1);
        }
        init[i] =tmp;
    }

    /*
    At this point, everything should be in the initial array
    The temporary array should be declared but not filled
    */

    clockid_t cpu;
    timespec start, end;
    long time[ntests];
    //2 Dimension vector this will called outer vector
    StringVector2D twoD;



    if(clock_getcpuclockid(0, &cpu) != 0)
    {
        cerr << "Error: could not get cpu clock" << endl;
        exit(EXIT_FAILURE);
    }
    int rep = 0;


    node tmp;
    tmp.count = 0;
    tmp.val = "";
    //Later I need to assign it to number of words * M ... Good for encryption... It is not a security subject
    vector<node> first(1000000,tmp);
    //This is called inner vector
    vector<string> templateVec;
    //Last search?
    bool last = false;

    //Initialize inner map as needed and put it inside the outer vector with no data
    for(int f=0;f<(stringlen);f++)
    {   

        StringVector myVec;
        twoD.push_back(myVec);
    }


    for(int i=0; i<ntests; i++)
    {   

        if(clock_gettime(cpu, &start) == -1)
        {
            cerr << "Error: could not get start time" << endl;
            exit(EXIT_FAILURE);
        }


        //Check if it is last iteration so do not delete data for printing purposeses
        if(i == ntests-1)
        {
            last = true;
        }

        /*copy from initial array to temporary array*/
        //Initialize inner vector with the values. In this point outer vector is filled with inner vector
        //&&&  inner vector is empty  myvec.empty() = true;
        //vector at index 0 is for words with one char... vector 1 is for words with two chars and so on...
        for(int j=0; j<numwords; j++)
        {
            int len = init[j].length()-1;
            if(len<0)continue;

            //Initilize a node to fill up the vector
            node currNode;
            currNode.val = init[j];
            //currNode.count = 0;           

            int hash  =  HashTable(init[j]);
            //Node already existed
            if(first[hash].count != 0){
                //Add to its value in hash table
                first[hash].count++;
            }
            else
            {
                //Activate word first time!
                first[hash].count =1;
                //I can even not use this because of the hash table but it may help in future improvment!!!
                first[hash].val = init[j];
                //Add the word to appropriate level in outer string! 1char == [0] ---  2char== [1] so on
                twoD[len].push_back(currNode);
            }   
        }
        //Sort Alphabetically order
        for(int f=0;f<(stringlen);f++)
        {
            //Eficcient sorting algorithm with no chance of segmentation dump ;)
            quickSort(twoD[f],0,twoD[f].size()-1);          
        }
        //Time finished
        if(clock_gettime(cpu, &end) == -1)
        {
            cerr << "Error: could not get end time" << endl;
            exit(EXIT_FAILURE);
        }

        //Delete items from vector if it is not last iteration --- This is not part of sorting algorithm so it is after clock
        if(!last)
        {
            for(int f=0;f<stringlen;f++)
            {
                twoD[f].clear();
            }
            twoD.clear();

            for(StringVector::iterator it3 = first.begin();it3!=first.end();it3++)
            {
                it3->val="";
                it3->count=0;
            }

            //Initialize inner map as needed and put it inside the outer vector 
            for(int f=0;f<(stringlen);f++)
            {
                StringVector myVec;
                twoD.push_back(myVec);
            }           
        }

        /*time per trial in nanoseconds*/
        time[i] = (end.tv_sec - start.tv_sec)*1000000000 + end.tv_nsec - start.tv_nsec;
    }


    /*output sorted temporary array*/
    int k=0;
    int y =0;
    int num=0;
    ofstream fout(argv[2]); 
    //Pointer for inner vector
    StringVector::iterator it2;
    for (StringVector2D::iterator outer = twoD.begin();  outer != twoD.end();  ++outer){
        y++;
        k=0;
        for (it2= outer->begin(); it2!=outer->end(); ++it2){
            //Get back data from hash table
            int hash  =  HashTable(it2->val);
            //Number of word in other field of the node
            int repWord = first[hash].count;
            //Print according to that
            for(int g=0; g < repWord ;g++){
                    num++;
                    //10 char in one line
                    if(num%10 == 0)
                    {
                        fout << it2->val;
                        fout<<endl;
                        k++;
                    }
                    else
                    {
                        fout<< it2->val << "  ";
                    }
                }
            }
        }
    //Sort times with STL for god sake....
    sort(time,time+ntests);
    //print times to the file///
    ofstream ftimes(argv[3]);
    for(int i=0; i<ntests; i++)
        ftimes << time[i] << endl;
}



//Helper function .. nice job
void get_max_words(ifstream& fin, int& wordlen, int& numwords)
{
    char c;
    int count=0;
    wordlen = numwords = 0;

    while(fin.good() && fin.get(c) && isspace(c)){;} //skip leading space
    while(fin.good())
    {
        ++numwords;
        while(fin.good() && !isspace(c)) 
        {
            ++count;
            fin.get(c);
        }
        if(count > wordlen)
            wordlen = count;
        count = 0;
        while(fin.good() && fin.get(c) && isspace(c)){;} //skip space
    }   
    if(count > wordlen)
        wordlen = count;
    fin.clear();
    fin.seekg(0, ios::beg);
}
CiaPan
  • 9,381
  • 2
  • 21
  • 35
Bernard
  • 4,240
  • 18
  • 55
  • 88
  • 2
    1) Re-implement a general-purpose sorting function which accepts a general comparator. 2) Pass an appropriately constructed comparator (trivial). — There is no reason at all to conflate these two problems into one. – Konrad Rudolph Sep 02 '14 at 16:49
  • When you implement a sorting routine from scratch, you will come to the point where you must write the comparator. In your case, make the comparator compare lengths first, and only perform a lexical compare on words of the same length. (This is similar to what Konrad Rudolph said, but with the comparator embedded instead of called.) – A. I. Breveleri Sep 02 '14 at 17:09
  • @A.I.Breveleri I was thinking to put the array in multi dimension array. first dimension is words with one character and second row is words with two character and so on.. And then sort each row independently. Do you think it is good sufficient way? – Bernard Sep 02 '14 at 17:17

4 Answers4

3

You'll primarily need a comparator for your sort routine to sort on:

bool lessThan(const std::string a, const std::string b) {
    if (a.length() != b.length()) 
         return a.length() < b.length();
    return a < b;
}
Paul Evans
  • 27,315
  • 3
  • 37
  • 54
  • 1
    OP doesn't want `sort` to be present since he 's not allowed to use library. – nevets Sep 02 '14 at 16:58
  • @nevets i said "your `sort`" meaning the OP's – Paul Evans Sep 02 '14 at 17:00
  • 1
    @nevets this answer doesn't place any restrictions on how the given function should be used. Yes it's usable for `std::sort` but it should be just as usable in a home-grown sorting function. – Mark Ransom Sep 02 '14 at 17:00
  • @MarkRansom Do you know any example of home-grown sorting function combined with this. I need a little bit more hint. – Bernard Sep 02 '14 at 17:04
  • 1
    @Bernard at some point in *any* sorting routine, two elements have to be compared to **see** which one goes **first**, hence `if (lessThan(a, b)) { // ...` – Paul Evans Sep 02 '14 at 17:07
  • I am looking for best efficient way. Do you think if I put the array in multi dimension array. first dimension is words with one character and second row is words with two character and so on.. And then sort each row independently. Do you think it is good sufficient way? – Bernard Sep 02 '14 at 17:20
  • @Bernard no, use the above comparator to **order** your strings. **Choose** the **most efficient** sort algorithm to impliment from your books, the web, *etc* – Paul Evans Sep 02 '14 at 17:40
1

It's about sorting based on multiple keys. I suggest you study some efficient sorting algorithm, say Quick Sort, then change the comparator to adapt the multiple keys.

For any sorting algorithm that is based on comparing, the easiest way to adapt multiple key sorting is to change the comparing criteria, from a single value to multiple values.

If you are not even allowed to use STL, i.e. you are not allowed to use sort in , here is a post you can start with: Sorting an array using multiple sort criteria (QuickSort)

If you are allowed, just write a comparing function which supports the multiple key comparison and plug it in the sort function. You can check this C++ reference for more details.

An illustration (it's just an illustration to point out how you can plug in the compare function):

bool comparator(const string& a, const string& b) {
    if (a.length() < b.length())
        return true;
    if (a.length() > b.length())
        return false;

    return a < b;
}

void Qsort(string a[],int low,int high)
{
    if(low >= high)
    {
        return;
    }

    int left = low;
    int right = high;
    string key = a[(low + high) >> 1];

    while(left < right)
    {
        while(left < right && comparator(a[left], key)) left++;     
        while(left < right && !comparator(a[right], key)) right--;

        if (left < right)
        {
             swap(a[left], a[right]);
             left++; right--;
        }
    }

    if (left == right) left ++;

    if (low < right) Qsort(a, low, left - 1);
    if (high > left) Qsort(a, right + 1, high);
}
Community
  • 1
  • 1
nevets
  • 4,631
  • 24
  • 40
  • Thanks. Link is helping a lot. Unfortunately, I am not allowed to use STL. – Bernard Sep 02 '14 at 17:01
  • 1
    @Bernard check the SO thread I posted in my answer, it's quite useful in your case :) – nevets Sep 02 '14 at 17:07
  • Your answer looks complete. I will implement it and finger cross if it works I accept your answer as solved very soon. Thanks. – Bernard Sep 02 '14 at 17:27
1

There's actually an easy way to implement this in stl. There's a sort method that takes a comparator:

template <class RandomAccessIterator, class Compare>
  void sort (RandomAccessIterator first, RandomAccessIterator last, Compare comp);

So you can do this:

bool comparator(const string& a, const string& b) {
    if (a.length() < b.length())
        return true;
    if (a.length() == b.length())
        return a < b;
    return false;
}

sort(words.begin(), words.end(), comparator);
ldgabbay
  • 1,133
  • 8
  • 9
1

The answer wants a design, so I'll focus on the design of your sorting library, than an implementation

Your sort algorithm can use your custom comparator objects with a member operator() implemented for comparison between two elements.

Your comparator can be a Linked List of comparators and can call the next comparator if the current one gives a tie. You'll have to ensure that there is always a true and false return though. Or implement something that can create a stable_sort if nothing else.

So the first comparator is number of characters and the second comparator is lexicographical.. This idea is then general enough so that if your requirement changes tomorrow. This can then be reused.

This is on the lines of Chain of Responsibility Pattern. You can templat-ize the comparator after you've got the gist.

Ex:

class Chain_Comparator
{
    Chain_Comparator* next;
public:
     bool operator()( void* a, void* b )
     {
          if( a_is_less_b(a, b) )
              return true;
          else if( b_is_less_a(a,b) )
              return false;
          else if( next )
              return next( a, b )    
     }    
    virtual bool a_is_less( void* a, void* b) = 0;
    virtual bool b_is_less( void* a, void* b) = 0;
 };

class Num_Comparator : public Chain_Comparator
{
    // Implements a_is_less etc.
};
class Lex_Comparator : public Chain_Comparator
{
  // Implement lex comparisons.
};

void your_custom_sorting_method( vector<int > a, Chain_Comparator& c)
{
// Implementation goes here.
//  call the operator() for c with simply : c( a[i], a[j] )

}
Kshitij Banerjee
  • 1,678
  • 1
  • 19
  • 35