I'm iterating through a string to convert it into a different data structure; that being a dictionary of objects, where the key is the k-length substring at the current position in the main string and the value is an object which counts the occurrences of that substring within "frames" of the main string (i.e., L-length substrings). I only keep a count up to a certain threshold, once that has passed, that key gets added to a bucket which is ultimately returned.
class kmerRecord:
_inFrames = {}
_threshold = None
reachedThreshold = False
def __init__(self, threshold, inFrames = []):
self._threshold = threshold
if inFrames:
self.incrementFrames(inFrames)
# Debug
def __repr__(self):
return '<in: ' + str(self._inFrames) + ', t: ' + str(self.reachedThreshold) + '>'
def incrementFrames(self, inFrames):
if not self.reachedThreshold:
for f in inFrames:
if f in self._inFrames:
self._inFrames[f] += 1
if self._inFrames[f] >= self._threshold:
self.reachedThreshold = True
break
else:
self._inFrames[f] = 1
def kmerClump(data, k, L, t):
kmers = {}
found = []
N = len(data)
for i in range(N - k + 1):
kmer = data[i : i + k]
inFrames = range(max(0, i - L + k), min(i, N - L) + 1)
if kmer in kmers:
kmers[kmer].incrementFrames(inFrames)
else:
kmers[kmer] = kmerRecord(t, inFrames)
# Debug
# print kmer, min(inFrames), max(inFrames), kmers
if kmers[kmer].reachedThreshold:
found.append(kmer)
return found
As far as I can see, my code is doing what I've described above. However, it doesn't work and when I try outputting the dictionary and objects, it appears as though every object in the dictionary is the same: As though there's actually only one object and the other key values are just references to that, rather than distinct objects. Have I got the semantics wrong, somewhere, or is there another problem?