Here's a method that returns the results btilly listed in their new answer, but limits the execution time for each result to 20 seconds (except for length 22, and there may be a couple of other outliers) :) ^^
The method uses bitsets and prioritises letter frequency in the word lists (hashed by a 201-bit bitset of letters to include duplicate letters), partitioning the relevant word list by including only word hashes that do not have the new excluded letter.
This is a hack to exit the recursion early because the greedy branches seems so skewed. I'd welcome ways to leverage the ideas here (studying btilly and David Eisenstat's answers more carefully notwithstanding).
from collections import defaultdict
import time
def as_number(word):
word = word.lower();
n = 0
for c in word:
m = ord(c) - 97
if n & (1 << m):
return 0
else:
n |= 1 << m
return n
def as_number_with_duplicates(word):
counts = defaultdict(lambda: 0)
word = word.lower();
n = 0
for c in word:
m = ord(c) - 97
n |= 1 << (counts[c] * 26 + m)
counts[c] += 1
return n
def get_letters(n):
letters = ""
i = 0
while n:
if n & 1:
letters += chr(97 + i % 26)
n >>= 1
i += 1
return letters
def add_to_freq(freq, bits):
i = 0
while bits:
if bits & 1:
freq[i] += 1
bits >>= 1
i += 1
def arrange_freq(freq):
return sorted([(x, i) for i, x in enumerate(freq)])
def get_tuple(tpl, i, k, seen, num_to_count_map):
bits = tpl[2]["bits"]
freq = [0] * len(tpl[2]["freq"])
nums = tpl[2]["nums"]
bit = 1 << i
new_bits = bits ^ bit
if new_bits in seen:
return None
seen.add(new_bits)
new_nums = []
count = 0
for num in nums:
if not (num & bit):
new_nums.append(num)
count += num_to_count_map[num]
add_to_freq(freq, num)
return (count, tpl[1] - 1, {
"bits": new_bits,
"freq": arrange_freq(freq),
"nums": new_nums
})
def upper_index(a, val, left, right):
if left == right:
return left
mid = (left + right + 1) // 2
if a[mid][0] > val:
return upper_index(a, val, left, mid-1)
else:
return upper_index(a, val, mid, right)
# tpl = (word_count, letter_count, {bits, freq, nums})
def solve(tpl, k, seen, num_to_count_map, global_best):
if tpl[1] == k:
print "cand: %s, %s, %s" % (get_letters(tpl[2]["bits"]), tpl[1], tpl[0])
if tpl[0] > global_best["count"]:
global_best["count"] = tpl[0]
global_best["bits"] = tpl[2]["bits"]
if tpl[1] > k:
freq = tpl[2]["freq"]
start = 1 + upper_index(freq, 0, 0, len(freq))
for idx in xrange(start, len(freq)):
if time.time() > global_best["end"]:
return
_freq, i = freq[idx]
new_tpl = get_tuple(tpl, i, k, seen, num_to_count_map)
# Prune
if not new_tpl or new_tpl[0] < global_best["count"]:
continue
solve(new_tpl, k, seen, num_to_count_map, global_best)
def f(words, k):
num_to_count_map = defaultdict(lambda: 0)
nums = set()
all_bits = 0
freq = []
for w in words:
if len(w) > k:
continue
num = as_number_with_duplicates(w)
nums.add(num)
num_bits = len(bin(num)[2:])
if len(freq) < num_bits:
freq.extend([0] * (num_bits - len(freq)))
add_to_freq(freq, num)
num_to_count_map[num] += 1
all_bits |= num
tpl = (len(words), bin(all_bits).count("1"), {"bits": all_bits, "freq": arrange_freq(freq), "nums": nums})
global_best = {
"count": -float('inf'),
"bits": 0,
"end": time.time() + (5*60 if k == 22 else 20)
}
solve(tpl, k, set([all_bits]), num_to_count_map, global_best)
return global_best
def format(w):
return "".join(sorted(w.lower()))
def are_equal(w1, w2):
return format(w1) == format(w2)
"""
import os
import datetime
path = "enable.txt"
words = []
with open(path) as file:
for values in file:
words.append(values.strip())
start = datetime.datetime.now()
for i in xrange(2, 31):
result = f(words, i)
formatted = format(get_letters(result["bits"]))
print result["count"], formatted
print are_equal(formatted, btilly[i])
print(datetime.datetime.now() - start)
"""
# Code by btilly
import re
import os
import datetime
path = "enable.txt"
words = []
with open(path) as file:
for values in file:
words.append(values.strip().lower())
key_count = {}
for word in words:
seen = {}
for letter in word:
if letter not in seen:
seen[letter] = 0
key = (letter, seen[letter])
if key not in key_count:
key_count[key] = 1
else:
key_count[key] += 1
seen[letter] += 1
KEYS = sorted(key_count.keys(), key=lambda key: -key_count[key])
#print(KEYS)
#print(len(KEYS))
KEY_POS = {}
for i in range(len(KEYS)):
KEY_POS[KEYS[i]] = i
KEY_POS[KEYS[i]] = i
# Now we will build a trie. Every node has a list of words, and a dictionary
# from the next letter farther in the trie.
# BUT TRICK:, we will map each word to a sequence of numbers, and those numbers
# will be indexes into KEYS. This allows us to use the fact that a second 'e' is
# unlikely to store that efficiently.
class Trie:
def __init__(self, path):
self.words = []
self.dict = {}
self.words_len = 0
self.count_words = 0
self.path = path
def add_word (self, word):
trie = self
poses = []
seen = {}
for letter in word:
if letter not in seen:
seen[letter] = 0
key = (letter, seen[letter])
poses.append(KEY_POS[(key)])
seen[letter] += 1
sorted_poses = sorted(poses);
for i in range(len(sorted_poses)):
trie.count_words += 1
pos = sorted_poses[i]
if pos not in trie.dict:
trie.dict[pos] = Trie(trie.path + KEYS[pos][0])
trie = trie.dict[pos]
trie.count_words += 1
trie.words_len += 1
trie.words.append(word)
def remove (self, pos):
trie = Trie(self.path)
trie.words = None
trie.dict = self.dict.copy()
trie.words_len = self.words_len
trie.count_words = self.count_words
trie.path = self.path
if pos in trie.dict:
trie.count_words -= trie.dict[pos].count_words
trie.dict.pop(pos)
return trie
def merge (self, other):
trie = Trie(self.path)
trie.words = None
trie.words_len = self.words_len + other.words_len
trie.count_words = self.count_words + other.count_words
trie.path = self.path
trie.dict = self.dict.copy()
for pos, subtrie in other.dict.items():
if pos in trie.dict:
trie.dict[pos] = trie.dict[pos].merge(subtrie)
else:
trie.dict[pos] = subtrie
return trie
def __repr__ (self):
answer = self.path + ":\n words_len: " + str(self.words_len) + "\n count_words:" + str(self.count_words) + " \n dict:\n"
def indent (string):
p = re.compile("^", re.M)
return p.sub(" ", string)
for pos in sorted(self.dict.keys()):
subtrie = self.dict[pos]
answer = answer + indent(str(pos) + ":\n" + subtrie.__repr__())
return answer
base_trie = Trie('')
for word in words:
base_trie.add_word(word)
base_tries = [base_trie]
last_trie = base_trie
for i in range(len(KEYS)):
subtrie = last_trie.dict[i]
last_trie = last_trie.remove(i).merge(subtrie)
base_tries.append(last_trie)
#print(base_tries[1].__repr__())
def best_solution (size):
def solve (subset, pos, best, partial):
found = sum(x[0] for x in partial)
upper_bound = sum(x[1] for x in partial)
if size <= len(subset) or upper_bound < best or len(KEYS) <= pos:
return (found, subset)
if best < found:
best = found
# Figure out our next calculations.
partial_include = []
partial_exclude = []
finalized_found = 0
for this_found, this_bound, this_trie in partial:
if this_trie is None:
# This is a generic record of already emptied tries
finalized_found += this_found
elif pos in this_trie.dict:
include_trie = this_trie.dict[pos]
partial_include.append((
this_found + include_trie.words_len,
include_trie.count_words + this_found,
include_trie
))
# We included the tally of found words in the previous partial.
# So do not double-count by including it again
partial_include.append((
0,
this_bound - include_trie.count_words - this_found,
this_trie
))
partial_exclude.append((
this_found,
this_bound - include_trie.count_words,
this_trie
))
elif this_found == this_bound:
finalized_found += this_found
else:
partial_include.append((
this_found,
this_bound,
this_trie
))
partial_exclude.append((
this_found,
this_bound,
this_trie
))
if 0 < finalized_found:
partial_include.append(
(finalized_found, finalized_found, None)
)
partial_exclude.append(
(finalized_found, finalized_found, None)
)
found_include, subset_include = solve(subset + [pos], pos+1, best, partial_include)
if best < found_include:
best = found_include
found_exclude, subset_exclude = solve(subset, pos+1, best, partial_exclude)
if found_include < found_exclude:
return (found_exclude, subset_exclude)
else:
return (found_include, subset_include)
best_count = 0
best_subset = []
start_subset = [i for i in range(size)]
while 0 < len(start_subset):
trie = base_tries[len(start_subset)].remove(start_subset[-1]+1)
count, subset = solve(list(start_subset), start_subset[-1]+2, best_count, [(trie.words_len, trie.count_words, trie)])
if best_count < count:
best_count = count
best_subset = subset
start_subset.pop()
return ''.join([KEYS[x][0] for x in best_subset])
# End code by btilly
for i in range(25, len(KEYS)):
start = datetime.datetime.now()
btilly = best_solution(i)
print btilly
print(datetime.datetime.now() - start)
start = datetime.datetime.now()
result = f(words, i)
formatted = format(get_letters(result["bits"]))
print formatted
print(datetime.datetime.now() - start)
print are_equal(btilly, formatted)
print ""