I've been practicing machine learning on the task of restoring spaces in a joined text. Since I decided to use the dictionary feature, I searched the web for some ideas of splitting the text based on the dictionary, and I stumbled upon this idea. Based on it, I've written a script that converts the text without spaces to a vertical form needed by the ML tool:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from math import log
import string
import fileinput
words = open("dictionary.txt", encoding="utf8").read().split()
wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))
maxword = max(len(x) for x in words)
def infer_spaces(s):
# Find the best match for the i first characters, assuming cost has
# been built for the i-1 first characters.
# Returns a pair (match_cost, match_length).
def best_match(i):
candidates = enumerate(reversed(cost[max(0, i-maxword):i]))
return min((c + wordcost.get(s[i-k-1:i], 9e999), k+1) for k,c in candidates)
# Build the cost array.
cost = [0]
for i in range(1,len(s)+1):
#original script was pretty basic, so symbols\words not in dictionary
#broke the processing completely. This fixed the problem.
if s[i-1] not in wordcost:
wordcost[s[i-1]] = log((len(words) + 1)*log(len(words)))
c,k = best_match(i)
cost.append(c)
print(cost)
# Backtrack to recover the minimal-cost string.
out = []
i = len(s)
while i>0:
c,k = best_match(i)
assert c == cost[i]
out.append(s[i-k:i])
i -= k
return " ".join(reversed(out))
def char_type(s):
""" Character type function """
if s in string.punctuation:
return "P"
elif s in string.digits:
return "D"
elif s in string.ascii_letters:
return "F"
elif s.isupper():
return "U"
else:
return "R"
def test_to_vert(s):
"""
Transforms regular text into a vertical form.
"""
s = s.rstrip('\n')
orig_sent = s
a = s.lower().replace("ё", "е")
a = infer_spaces(a)
space_indices = []
a = list(a)
for i,k in enumerate(a):
if k == " ":
space_indices.append(i)
orig_sent = list(orig_sent)
for i in space_indices:
orig_sent.insert(i, " ")
orig_sent = "".join(orig_sent)
orig_sent = orig_sent.split(" ")
answer = []
for word in orig_sent:
i = 0
for letter in word:
answer.append(letter + "\t" + letter.lower() + "\t" + \
char_type(letter) + "\t" + str(i) + "|" + str(len(word)))
i += 1
return '\n'.join(answer)
testfile = open("head.txt", encoding="utf8")
output = open("test_python.txt", 'w', newline="\n", encoding="utf8")
for line in testfile:
if line in ['\n', '\r\n']:
output.write('\n')
else:
output.write(test_to_vert(line))
output.write('\n\n')
output.write('\n\n\n')
testfile.close()
output.close()
So far so good, it works. After that I decided to practice my Ruby (I'm relatively new to coding), so I tried to re-write the script (Ruby version):
#!/usr/bin/ruby
#encoding: UTF-8
Encoding::default_internal = "UTF-8"
Encoding::default_external = "UTF-8"
require 'active_support/core_ext'
@wordcost = Hash.new
@count = %x{wc -l dictionary.txt}.split.first.to_i
i = 0
File.readlines("dictionary.txt").each do |line|
line.chomp!
@wordcost[line.mb_chars.downcase.to_s] ||= Math.log((i+1) * Math.log(@count))
i += 1
end
def infer_spaces(s)
@sent = s.chomp
def best_match(i)
result = []
candidates = @cost[0, i].reverse
candidates.each_index do |index|
if @wordcost.has_key?(@sent[i-index-1...i].mb_chars.downcase.to_s)
result << [(candidates[index] + @wordcost[@sent[i-index-1...i].mb_chars.downcase.to_s]), (index + 1)]
else
result << [(candidates[index] + Float::INFINITY), (index + 1)]
end
end
result.sort!
return result[0][0], result[0][1]
end
@cost = [0]
for i in (1..@sent.length)
@wordcost[@sent[i-1].mb_chars.downcase.to_s] ||= Math.log(@count * Math.log(@count))
c, k = best_match(i)
@cost << c
end
out = []
i = @sent.length
while i>0
c, k = best_match(i)
if c != @cost[i]
raise "Something went wrong"
end
out << @sent[i-k...i]
i -= k
end
return out.reverse.join(" ")
end
def char_type(string)
case string
when /[[:punct:]]/
return "P"
when /[[:digit:]]/
return "D"
when /[A-z]/
return "F"
when /[[:upper:]]/
return "U"
else
return "R"
end
end
def test_to_vert(s)
s.chomp!
orig_sent = s
a = s.mb_chars.downcase.to_s
a = infer_spaces(a)
space_indices = []
a = a.split("")
a.each_index do |i|
if a[i] == " "
space_indices << i
end
end
orig_sent = orig_sent.split("")
space_indices.each do |x|
orig_sent.insert(x, " ")
end
orig_sent = orig_sent.join
orig_sent = orig_sent.split
answer = []
orig_sent.each do |word|
letters = word.split("")
letters.each_index do |i|
answer << letters[i] + "\t" + letters[i].mb_chars.downcase.to_s + \
"\t" + char_type(letters[i]) + "\t" + i.to_s + "|" + word.length.to_s
end
end
return answer.join("\n")
end
file = File.open('test_ruby_vert.txt', 'w')
File.readlines("test.txt").each do |line|
if line.chomp.empty?
file.write("\n")
else
file.write(test_to_vert(line))
file.write("\n\n")
end
end
file.close
The rewritten script works, however, it is really slow compared to the Python version (a ~40000-line text is processed in like not more than an hour, a Ruby script worked for hours for now, and it only processed like 15% of the text).
I wonder what could slow it down so much? Could it be that is because of the fact that i need to use "active_support/core_ext" to downcase Cyrillic text in Ruby? Could it be because I don't limit the processing in best_match using maxword? Maybe some other rewrite really messed the script up? Any insight will be really helpful for me.