How about this (using the STL, comparators and functors)?
NOTE: All assumptions and explanations are in the source code itself.
#include <iostream>
#include <string>
#include <vector>
#include <cstdlib>
#include <sstream>
#include <algorithm>
#include <cctype>
#include <utility>
#include <string.h>
// Ordering predicate for pairs: l comes before r exactly when l's first member
// is smaller than r's. The second member plays no part in the ordering.
bool compare (const std::pair<int, int>& l, const std::pair<int, int>& r) {
    const bool l_precedes_r = (r.first > l.first);
    return l_precedes_r;
}
// Unary predicate (functor): stores a value at construction and, when called
// with a pair, reports whether the pair's first member equals that value.
struct CompareFirst {
private:
    int wanted_;  // the value each pair's first member is tested against

public:
    // Remember the value to look for.
    CompareFirst(int val) : wanted_(val) {}

    // True when p.first equals the stored value; p.second is not inspected.
    bool operator()(const std::pair<int, int>& p) const {
        const bool matches = (p.first == wanted_);
        return matches;
    }
};
// Prints a word-length histogram for each '#'-separated block of the input text.
//
// Each block is normalised and then split on whitespace:
//   1. a newline directly after a hyphen is dropped, re-joining a word that was
//      hyphenated across two lines;
//   2. every hyphen and apostrophe is deleted, so they add nothing to a length;
//   3. '.', ',', '!' and '?' are turned into spaces, so they separate words.
// Every distinct word length is then written as "<length> <count>" in ascending
// length order, and a blank line closes the block (even a block with no words).
//
// Returns 0.
int main() {
    typedef std::vector<std::pair<int, int> > LengthCounts;

    const std::string delims = ".,!?";  // word separators: replaced by spaces
    const std::string noise = "-'";     // zero-length punctuation: deleted outright

    // The text is assumed to have been read already (e.g. from a file) and stored in a string like this one.
    std::string input = "This is fun-\nny, Mr.P and I've never seen\nthis ice-cream flavour\nbefore.Crazy eh?\n#\nThis is fun-\nny! Mr.P and I've never seen\nthis ice-cream flavour\nbefore.Crazy eh?\n#\n";
    std::istringstream blocks(input);
    std::string block;

    // First split the input into blocks at every '#'.
    while (std::getline(blocks, block, '#')) {
        // Re-join words hyphenated across a line break: drop the newline that
        // directly follows a hyphen. The hyphen itself is removed further down.
        std::string::size_type pos = 0;
        while ((pos = block.find('-', pos)) != std::string::npos) {
            // Check the bound first: the hyphen may be the block's last character.
            if (pos + 1 < block.size() && block[pos + 1] == '\n') {
                block.erase(pos + 1, 1);
            }
            ++pos;
        }

        // Delete the noise characters so they do not count towards word length.
        for (std::string::size_type i = 0; i < noise.size(); ++i) {
            block.erase(std::remove(block.begin(), block.end(), noise[i]), block.end());
        }

        // Turn the remaining delimiters into spaces so that stream extraction,
        // which skips whitespace, can do the tokenizing.
        for (std::string::size_type i = 0; i < delims.size(); ++i) {
            std::replace(block.begin(), block.end(), delims[i], ' ');
        }

        // Tally how many words there are of each length.
        LengthCounts occurrences;
        std::istringstream words(block);
        std::string word;
        while (words >> word) {
            const int length = static_cast<int>(word.size());
            LengthCounts::iterator it =
                std::find_if(occurrences.begin(), occurrences.end(), CompareFirst(length));
            if (it != occurrences.end()) {
                // Length seen before: bump its count.
                ++it->second;
            } else {
                // First word of this length: record it with a count of 1.
                occurrences.push_back(std::make_pair(length, 1));
            }
        }

        // Sort by length and print one "<length> <count>" line per entry.
        std::stable_sort(occurrences.begin(), occurrences.end(), compare);
        for (LengthCounts::size_type i = 0; i < occurrences.size(); ++i) {
            std::cout << occurrences[i].first << " " << occurrences[i].second << "\n";
        }
        std::cout << "\n";
    }
    return 0;
}
91 lines, and all vanilla C++98.
A rough outline of what I did is:
- Since hyphens occur across two lines, find all hyphens and remove any newlines that follow them.
- Some characters don't add to the length of a word, such as the hyphen in legitimately hyphenated words and the apostrophe. Find these and erase them, as it makes tokenizing easier.
- All the other remaining delimiters can now be found and replaced with whitespace. Why? Because we can use the whitespace to our advantage by using streams (whose default action is to skip whitespace).
- Create a stream and tokenize the text via whitespace, as described in the previous step.
- Store the lengths of the tokens and their occurrences.
- Sort the lengths of the tokens, and then output the token length and corresponding occurrences.
REFERENCES:
https://stackoverflow.com/a/5815875/866930
https://stackoverflow.com/a/12008126/866930