I am running python code on data-bricks to clean the text.
Some text has values like this "环境ä¸å¥½ä½¿ç”¨
which I wan't to remove.
Here is the code:
def docs_preprocessor(docs):
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
# print(docs[idx])
docs[idx] = " ".join(w.lower() for w in nltk.wordpunct_tokenize(docs[idx]) if w.lower() in words or not w.isalpha())
docs[idx] = ' '.join(s for s in docs[idx].split() if not any(c.isdigit() for c in s))
# print(docs[idx])
docs[idx] = tokenizer.tokenize(docs[idx]) # Split into words.
# print(docs[idx])
# docs[idx] = docs[idx].lower() # Convert to lowercase.
# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isdigit()] for doc in docs]
docs = [[token.strip("_") for token in doc ] for doc in docs]
# Remove words that are only one character
docs = [[token for token in doc if len(token) > 3] for doc in docs]
# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
docs = [" ".join(doc) for doc in docs]
return docs
But I am getting the error as:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 4: ordinal not in range(128)
I tried fixing this using this link: UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
But it didn't work.
When I checked what is the python version in Databricks:
from platform import python_version
print(python_version())
2.7.12