I have been working on this DNA problem for a while now, and I do not understand what causes the many problems. Right now there are just some technical problems, for example saying "No match" instead of "Lavender", but I believe there is no problem when reading the files in the folder "sequences". In check50, there is a traceback error:
Traceback (most recent call last):
File "/tmp/tmp7j4mzmff/test2/dna.py", line 83, in <module>
main()
File "/tmp/tmp7j4mzmff/test2/dna.py", line 18, in main
os.chdir(path)
FileNotFoundError: [Errno 2] No such file or directory: '/workspaces/9...
Can you help?
import csv
import sys
import os
def main():
if len(sys.argv) != 3:
print("Usage: python dna.py data.csv sequence.txt")
sys.exit(1)
database = []
with open(sys.argv[1], "r") as file:
reader = csv.DictReader(file)
for row in reader:
database.append(row)
path = "/workspaces/90389241/dna/sequences"
os.chdir(path)
for file in os.listdir():
with open(file, 'r') as file:
sequence = file.read()
# TODO: Find longest match of each STR in DNA sequence
subsequences = list(database[0].keys())[1:]
finals = {}
for subsequence in subsequences:
finals[subsequence] = longest_match(sequence, subsequence)
# TODO: Check database for matching profiles
for someone in database:
count2 = 0
for subsequence in subsequences:
if (int(someone[subsequence]) == finals[subsequence]):
count2 += 1
if count2 == len(subsequences):
print(someone["name"])
return
print("No match")
return
def longest_match(sequence, subsequence):
"""Returns length of longest run of subsequence in sequence."""
# Initialize variables
longest_run = 0
subsequence_length = len(subsequence)
sequence_length = len(sequence)
# Check each character in sequence for most consecutive runs of subsequence
for i in range(sequence_length):
# Initialize count of consecutive runs
count = 0
# Check for a subsequence match in a "substring" (a subset of characters) within sequence
# If a match, move substring to next potential match in sequence
# Continue moving substring and checking for matches until out of consecutive matches
while True:
# Adjust substring start and end
start = i + count * subsequence_length
end = start + subsequence_length
# If there is a match in the substring
if sequence[start:end] == subsequence:
count += 1
# If there is no match in the substring
else:
break
# Update most consecutive matches found
longest_run = max(longest_run, count)
# After checking for runs at each character in seqeuence, return longest run found
return longest_run
main()
My first given error was, and I quote, "KeyError: 0" for "subsequences = list(database[0].keys())[1:]
", but I don't think there is a problem with accessing the data inside of "database". I had used
if len(sys.argv[2]) == "databases/small.csv":
with open("small.csv", "r") as file:
reader = csv.DictReader(file)
for row in reader:
database.append(row)
elif len(sys.argv[2]) == "databases/large.csv":
with open("large.csv", "r") as file:
reader = csv.DictReader(file)
for row in reader:
database.append(row)
instead of what I used above.