For the purposes of the reprex I've generated a tibble called random_DNA_tbl that is a random selection of 10 DNA sequences (of 100 bases each). I've got a separate tibble called subseq_tbl, with 3 shorter sequences that match 100% to 3 of the sequences in random_DNA_tbl, but I'd also like to use fuzzy matching of sequences from subseq_tbl to other sequences in random_DNA_tbl. I was hoping to be able to use the fuzzyjoin package's stringdist_XX_join functions; however, these don't seem to work, even though the subseq sequences are actually perfect matches and do work with other matching functions, e.g. regex_XX_join.
# Attach the packages used by the reprex below.
library(tidyverse)
# The package is spelled "fuzzyjoin"; it provides the stringdist_*_join and
# regex_*_join functions called later in this script.
library(fuzzyjoin)
# Ten named DNA sequences, stored as a two-column tibble
# (columns: random_name, random_seq). Built with structure() so the object
# carries the tibble classes without calling any constructor.
random_DNA_tbl <- structure(
  list(
    random_name = c(
      "random_seq_1", "random_seq_2", "random_seq_3", "random_seq_4",
      "random_seq_5", "random_seq_6", "random_seq_7", "random_seq_8",
      "random_seq_9", "random_seq_10"
    ),
    random_seq = c(
      "CTCCAGTATTAGTCAATGATAAGGGCGAAGGAGCAGTTCTGATATCTCTGTGAAGTAGCATGCGTCTGACTCTCGGGCGCGGCGGAAGACCGAGGAGCGC",
      "TTTTCGTCCGACAGAACATCATATAAACTCGATTTAATCTTCTTTTCAAAATCAATTCGAGGGCACCCGATGCGCGTACTGTCAACCATCAAGATAACGA",
      "GAATAGTGTACCAGGTCTTATAGTATGTTCATTCGTACAAAAGGATCCAAAACCAATAGGAACCGCTTCTCCCAACAAGCCTGCTCCTTGCAGAGTGAGT",
      "GTGACGCCAGATTCTTGACCTGAACCCAGTTCTACCCCCCCAAAACGATCTGGCTTCCGCTCTCTAATGACAGCTATATTGCTTGATAGAGATCGGTAGG",
      "ACCGCCTTCCGTAGGTGAACAACCAGCCTCCTGCGGCCAGGGAAGAAGTCGTGGCCTTGGTTAATTTTGGGTTACTAAACGGACACCCACCGTGGCTCAC",
      "ACGACTATCAAGACAACTTGTCTCAGAGCTTCACGCACCAACCCCTAACCCAGCAACTCCAGGGCATTGCCACTCTATGATTCGGCGCGGGTGCGCCCTC",
      "GGTAGCACTGAGATCAGCCACTATCAAGGTGCTCCTCACTTCTGGTTCTCAGGTTGCGGGCCGATCATTTTTCTCCGAATTAGCGGTCTTTCACGTCAGA",
      "CACTGAATAGTCAGCGTAAAGGCGTCAATCTGTCAGCTCGACGGCAGAAGATGTCCAGCGTGCAGTTTCATAGGCGCCCCGGGGAACCTTCTGTGAGAAT",
      "GCCTCTTAATTCTTGAACCGCGAGAGGACACAGTGAGATCTGTTCCATTTCCCCCGTTGCCCGCATGGATCGCCCAGACTCTAGACTTAGTGTGACCTTT",
      "CGGTATCGGATTGGTCTACGAATCCGCGACCCTCAAGGTTATTTCTGGATGGAGTTCCGTGCTCGCCTGGATGCACTGCCCAAGCAATTAGGACGAAGTA"
    )
  ),
  # c(NA, -n) is R's compact encoding for automatic row names 1..n.
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Three named short query sequences, stored as a two-column tibble
# (columns: subseq_name, subseq).
subseq_tbl <- structure(
  list(
    subseq_name = c("subseq1", "subseq2", "subseq3"),
    subseq = c("TCAACCATCAAGATAAC", "TAGCGGTCTTTCACGT", "AAGGATCC")
  ),
  # c(NA, -n) is R's compact encoding for automatic row names 1..n.
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)
Doesn't work:
# Left-join every row of random_DNA_tbl to subseq_tbl by string distance
# between random_seq and subseq.
# NOTE(review): string distance is presumably computed between the two WHOLE
# strings, so a long sequence versus a much shorter subsequence would exceed
# any small max_dist even when the subsequence occurs verbatim inside the
# sequence -- confirm against the fuzzyjoin/stringdist documentation.
stringdist_left_join(
  x = random_DNA_tbl,
  y = subseq_tbl,
  by = c(random_seq = "subseq")
)
Does work:
# Left-join every row of random_DNA_tbl to subseq_tbl, pairing the
# random_seq column with the subseq column via regex matching.
# NOTE(review): presumably subseq is used as the pattern searched for inside
# random_seq, which would explain why exact substrings match -- confirm
# against the fuzzyjoin documentation.
regex_left_join(
  x = random_DNA_tbl,
  y = subseq_tbl,
  by = c(random_seq = "subseq")
)
I've tried tweaking the max_dist parameter in stringdist but to no avail. Can anyone shed any light on the problem please?