
I wrote the code below. My sentences come from Twitter. I want to remove all emojis from my list, but my emoji removal does not work. Why?

I also want to remove user mentions. They appear at the beginning of a sentence, but my code sometimes keeps them and sometimes removes them. My punctuation removal does not work either, so I commented it out. How can I fix that?

import re

import spacy
from nltk.corpus import stopwords  # assuming the stop word list comes from NLTK

nlp = spacy.load('en')

stop_words = [w.lower() for w in stopwords.words()]

def sanitize(input_string):
    """ Sanitize one string """

    # Remove emoji
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    
    string = emoji_pattern.sub(r'', input_string) # No emoji

    # Normalize to lowercase 
    string = input_string.lower()

    # Spacy tokenizer 
    string_split = [token.text for token in nlp(string)]
    

    # In case the string is empty 
    if not string_split:
        return '' 

    # Remove user

    # Assuming user is the first word and contains an @
    if '@' in string_split[0]:
        del string_split[0]

    # Join back to string 
    string = ' '.join(string_split)

    # Remove # and @
    for punc in '":!@#':
       string = string.replace(punc, '')

    # Remove 't.co/' links
    string = re.sub(r'http//t.co\/[^\s]+', '', string, flags=re.MULTILINE)

    # Removing stop words 
    string = ' '.join([w for w in string.split() if w not in stop_words])

    # Punctuation
    # string = [''.join(w for w in string.split() if w not in string.punctuation) for w in string]

    return string


list = ['@cosmetic_candy I think a lot of people just enjoy being a pain in the ass on there',
        'Best get ready sunbed and dinner with nana today :)',
        '@hardlyin70 thats awesome!',
        'Loving this weather',
        '“@danny_boy_37: Just seen an absolute idiot in shorts! Be serious!” Desperado gentleman',
        '@SamanthaOrmerod trying to resist a hardcore rave haha! Resisting towns a doddle! Posh dance floor should wear them in quite easy xx',
        '59 days until @Beyonce!!! Wooo @jfracassini #cannotwait',
        'That was the dumbest tweet I ever seen',
        'Oh what to do on this fine sunny day?',
        '@Brooke_C_X hows the fish ? Hope they r ok. Xx',
        '@Jbowe_ ',
        'Or this @louise_munchi',
        '@guy_clifton your diary is undoubtedly busier than mine, but feel free to check ',
        'Willy⚽']



list_sanitized = [sanitize(string) for string in list]

list_sanitized[:50]



1 Answer


I'm drawing on some other SO answers here.

The reason your emoji removal appears to do nothing is that its result is thrown away: right after emoji_pattern.sub(...) you lowercase input_string instead of string, so the emoji are still in the text that gets processed further. Your user removal only checks string_split[0], so it misses mentions that are not the very first token (for example the tweet that opens with a quotation mark, or 'Or this @louise_munchi').

The code below removes a Twitter username wherever it appears in the string instead.
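As a quick illustration, here is the mention-stripping regex from the function run on its own against two of your example tweets (expected output shown in the comments):

import re

# The handle is removed wherever it occurs, not only at the start of the tweet.
print(re.sub(r"(?:@[\w_]+)", '', '@hardlyin70 thats awesome!'))  # -> ' thats awesome!'
print(re.sub(r"(?:@[\w_]+)", '', 'Or this @louise_munchi'))      # -> 'Or this '

The full cleaning function: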

import re

import emoji
import spacy
import stop_words

nlp = spacy.load('en_core_web_sm')

stopwords = [w.lower() for w in stop_words.get_stop_words('en')]

emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth      
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

def give_emoji_free_text(text): 
    return emoji.get_emoji_regexp().sub(r'', text)

def sanitize(string):
    """ Sanitize one string """

    # remove graphical emoji
    string = give_emoji_free_text(string) 

    # remove textual emoji (re.VERBOSE because the pattern contains whitespace and comments)
    string = re.sub(emoticon_string, '', string, flags=re.VERBOSE)

    # normalize to lowercase 
    string = string.lower()

    # spacy tokenizer 
    string_split = [token.text for token in nlp(string)]

    # in case the string is empty 
    if not string_split:
        return '' 

    # join back to string 
    string = ' '.join(string_split)

    # remove user 
    # assuming user has @ in front
    string = re.sub(r"""(?:@[\w_]+)""",'',string)

    # remove leftover '"', ':', '!', '@' and '#' characters
    for punc in '":!@#':
        string = string.replace(punc, '')

    # remove 't.co/' links; the ':' was already stripped by the loop above,
    # so the pattern must not include 'http://'
    string = re.sub(r'https?//t\.co/\S+', '', string, flags=re.MULTILINE)

    # removing stop words 
    string = ' '.join([w for w in string.split() if w not in stopwords])

    return string
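
A quick usage sketch on a few of your tweets (this assumes the en_core_web_sm model is installed and an emoji version that still provides get_emoji_regexp, i.e. emoji < 2.0):

tweets = ['@cosmetic_candy I think a lot of people just enjoy being a pain in the ass on there',
          '59 days until @Beyonce!!! Wooo @jfracassini #cannotwait',
          'Willy⚽']

# Mentions, '#', '!' and graphical emoji should be stripped, the text lowercased,
# and stop words dropped; e.g. the last tweet should come back as just 'willy'.
cleaned = [sanitize(t) for t in tweets]
for before, after in zip(tweets, cleaned):
    print(repr(before), '->', repr(after))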