I'm writing a script in python that will replace email address and IP address with random characters in a log file.
Right now one line is read, the substitution for email address is done, and then the same line is read again to do the substitution for the IP address.
I want to do both in one go, but cannot find a way to do so.
I read the post here, but that seems to work only for simple substitutions. In this case, I'm substituting a (relatively) complex regex with a function, and can't figure out a way to do it in one line.
This is the original code:
import re
import hashlib
def hashing_func(to_hash):
''' Calculates hash '''
return hashlib.sha256(to_hash).hexdigest()
def hashing_func_email(username, domain):
''' Creates a separate hash for username and
domain. Appends 'EM_' to show it's an email.
Reduces hash length to make it more readable.'''
username_hash = hashing_func(username)
domain_hash = hashing_func(domain)
return 'EM_' + username_hash[:13] + '@' + domain_hash[:10]
def hashing_func_ipaddr(ipaddr):
''' Creates a hash for IP address, and appends 'IP_'.
Reduces length to make it more readable.'''
ipaddr_hash = hashing_func(ipaddr)
return 'IP_' + ipaddr_hash[:11]
def main():
email_regex = re.compile(r'''(
[a-zA-Z0-9._+-]+)
(@|%40)
([a-zA-Z0-9.-]+
(\.[a-zA-Z0-9]{2,4})
)''', re.VERBOSE)
ipaddr_regex = re.compile(r'''\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}
(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b
''', re.VERBOSE)
old_file = ["10.38.1.2, user@example.com, was here",
"This user2@example.com tried from 84.12.41.53 again"]
for line in old_file:
new_line = re.sub(
email_regex, lambda x: hashing_func_email(
x.group(1), x.group(3)), line)
new_line_ip = re.sub(
ipaddr_regex, lambda x: hashing_func_ipaddr(
x.group(0)), new_line)
print new_line_ip
if __name__ == '__main__':
main()