I'm writing a program to split the words contained in an hashtag.
For example I want to split the hashtags:
#Whatthehello #goback
into:
What the hello go back
I'm having troubles when using re.sub
with a functional argument.
The code I've written is:
import re,pdb
def func_replace(each_func):
i=0
wordsineach_func=[]
while len(each_func) >0:
i=i+1
word_found=longest_word(each_func)
if len(word_found)>0:
wordsineach_func.append(word_found)
each_func=each_func.replace(word_found,"")
return ' '.join(wordsineach_func)
def longest_word(phrase):
phrase_length=len(phrase)
words_found=[];index=0
outerstring=""
while index < phrase_length:
outerstring=outerstring+phrase[index]
index=index+1
if outerstring in words or outerstring.lower() in words:
words_found.append(outerstring)
if len(words_found) ==0:
words_found.append(phrase)
return max(words_found, key=len)
words=[]
# The file corncob_lowercase.txt contains a list of dictionary words
with open('corncob_lowercase.txt') as f:
read_words=f.readlines()
for read_word in read_words:
words.append(read_word.replace("\n","").replace("\r",""))
For example when using these functions like this:
s="#Whatthehello #goback"
#checking if the function is able to segment words
hashtags=re.findall(r"#(\w+)", s)
print func_replace(hashtags[0])
# using the function for re.sub
print re.sub(r"#(\w+)", lambda m: func_replace(m.group()), s)
The output I obtain is:
What the hello
#Whatthehello #goback
Which is not the output I had expected:
What the hello
What the hello go back
Why is this happening? In particular I've used the suggestion from this answer but I don't understand what goes wrong in this code.