#!/usr/bin/env python
# -*- coding: utf_8 -*-
def splitParagraphIntoSentences(paragraph):
    """Break a paragraph into sentences and return them as a list.

    A sentence boundary is a '.', '!' or '?' followed by one or two
    whitespace characters and then an uppercase ASCII letter (A-Z).
    The terminator and the whitespace are consumed by the split, so every
    piece except the last loses its closing punctuation; the uppercase
    letter is only looked at, not consumed, and starts the next piece.

    paragraph -- the text to split.
    Returns a list of strings; input with no boundary (including the
    empty string) comes back as a one-element list.
    """
    import re

    # Flags belong to re.compile(). The second positional argument of
    # pattern.split() is maxsplit, so passing re.UNICODE (== 32) there
    # silently limited the result to 32 splits and never set the flag.
    # The raw string keeps '\s' from being read as a string escape.
    sentenceEnders = re.compile(r'[.!?][\s]{1,2}(?=[A-Z])', re.UNICODE)
    return sentenceEnders.split(paragraph)
if __name__ == '__main__':
    # Demo: split a sample paragraph containing non-ASCII punctuation and
    # print each resulting sentence on its own line.
    p = "While other species (e.g. horse mango, M. foetida) are also grown ,Mangifera indica – the common mango or Indian mango – Sheffield’s only mango tree is valued at £9.2 billion."
    sentences = splitParagraphIntoSentences(p)
    for s in sentences:
        # print(...) with a single argument prints the same text under
        # Python 2 and Python 3; the bare `print s` statement form is a
        # syntax error on Python 3.
        print(s.strip())
Expected Output: While other species (e.g. horse mango, M. foetida) are also grown ,Mangifera indica – the common mango or Indian mango – Sheffield’s only mango tree is valued at £9.2 billion.
Output Received: While other species (e.g. horse mango, M. foetida) are also grown ,Mangifera ind ica – the common mango or Indian mango – Sheffield’s only mango tree is va lued at £9.2 billion.
Ignore the meaning of the sentence; the main point is that it isn't able to access special characters such as " - ", " £ ", " ’ " and others. I tried setting up a sitecustomize.py file, and I tried this code with other encodings such as ascii, utf-32, cp-500, iso8859_15 and utf-8, but wasn't able to solve it. Sorry, I am new to Python. Thanks in advance for the help.