1

The SeqAn tutorial for Pattern Matching mentions that StringSets can serve as either haystacks or needles. I attempted to use a StringSet as the haystack as follows:

// Haystack: the set of DNA sequences that should be searched.
StringSet<Dna5String> seqs;

/* do stuff to load sequences into seqs */

// Finder constructed directly over the whole StringSet (no index involved).
Finder<StringSet<Dna5String> > finder(seqs);
// Needle "GAATTC", using the Simple pattern specialization.
Pattern<Dna5String, Simple> pattern(Dna5String("GAATTC"));

// NOTE(review): this snippet is reported not to compile (ambiguous operator==
// between a String and a SimpleType); presumably the error is raised when this
// find() call is instantiated -- confirm against the full compiler output.
if (find(finder, pattern))
{
  // Print the hit as "[begin,end)", a tab, and then the matched infix.
  std::cout << '[' << beginPosition(finder) << ',' << endPosition(finder)
            << ")\t" << infix(finder) << std::endl;
} else
{
  // Note: unlike the match branch, this message is not followed by a newline.
  std::cout << "No match!";
}

I get the error:

error: use of overloaded operator '==' is ambiguous (with operand types 'const const seqan::String<seqan::SimpleType<unsigned char, seqan::Dna5_>, seqan::Alloc<void> >' and 'const seqan::SimpleType<unsigned char, seqan::Dna5_>')

Anyone have an idea as to how this should be done properly?

Using a single Dna5String in the Finder works fine. The tutorial does show how to do an offline search (i.e., with indexing), but that is not what I'd like. I'd prefer not to have to manually iterate over the StringSet if the Finder-Pattern tools in SeqAn already handle it.

merv
  • 67,214
  • 13
  • 180
  • 245

1 Answers1

1

You can try the following:

#include <iostream>
#include <seqan/sequence.h>  // CharString, ...
#include <seqan/find.h>
#include <seqan/stream.h>

using namespace seqan;

typedef Iterator<StringSet<Dna5String> >::Type TStringSetIterator;

int main(int, char const **)
{
    StringSet<Dna5String> seqs;
    Dna5String seq1 =
        "TAGGTTTTCCGAAAAGGTAGCAACTTTACGTGATCAAACCTCTGACGGGGTTTTCCCCGTCGAAATTGGGTG"
        "TTTCTTGTCTTGTTCTCACTTGGGGCATCTCCGTCAAGCCAAGAAAGTGCTCCCTGGATTCTGTTGCTAACG"
        "AGTCTCCTCTGCATTCCTGCTTGACTGATTGGGCGGACGGGGTGTCCACCTGACGCTGAGTATCGCCGTCAC"
        "GGTGCCACATGTCTTATCTATTCAGGGATCAGAATTCATTCAGGAAATCAGGAGATGCTACACTTGGGTTAT"
        "CGAAGCTCCTTCCAAGGCGTAGCAAGGGCGACTGAGCGCGTAAGCTCTAGATCTCCTCGTGTTGCAACTACA"
        "CGCGCGGGTCACTCGAAACACATAGTATGAACTTAACGACTGCTCGTACTGAACAATGCTGAGGCAGAAGAT"
        "CGCAGACCAGGCATCCCACTGCTTGAAAAAACTATNNNNCTACCCGCCTTTTTATTATCTCATCAGATCAAG";
    Dna5String seq2 =
        "ACCGACGATTAGCTTTGTCCGAGTTACAACGGTTCAATAATACAAAGGATGGCATAAACCCATTTGTGTGAA"
        "AGTGCCCATCACATTATGATTCTGTCTACTATGGTTAATTCCCAATATACTCTCGAAAAGAGGGTATGCTCC"
        "CACGGCCATTTACGTCACTAAAAGATAAGATTGCTCAAANNNNNNNNNACTGCCAACTTGCTGGTAGCTTCA"
        "GGGGTTGTCCACAGCGGGGGGTCGTATGCCTTTGTGGTATACCTTACTAGCCGCGCCATGGTGCCTAAGAAT"
        "GAAGTAAAACAATTGATGTGAGACTCGACAGCCAGGCTTCGCGCTAAGGACGCAAAGAAATTCCCTACATCA"
        "GACGGCCGCGNNNAACGATGCTATCGGTTAGGACATTGTGCCCTAGTATGTACATGCCTAATACAATTGGAT"
        "CAAACGTTATTCCCACACACGGGTAGAAGAACNNNNATTACCCGTAGGCACTCCCCGATTCAAGTAGCCGCG";

    clear(seqs);
    appendValue(seqs, seq1);
    appendValue(seqs, seq2);

    Pattern<Dna5String, Simple> pattern(Dna5String("GAATTC"));

    //For each sequence in seqs
    for (TStringSetIterator it = begin(seqs); it != end(seqs); ++it)
    {
        std::cout << *it << std::endl;
        //I create a finder for each sequence in seqs
        Finder<Dna5String> finder(*it);
        if (find(finder, pattern)){
            std::cout << '[' << beginPosition(finder) << ',' << endPosition(finder)
                      << ")\t" << infix(finder) << std::endl;
        }else{
            std::cout << "No match!" << std::endl;
        }
    }
    return 0;
}

You get:

TAGGTTTTCCGAAAAGGTAGCAACTTTACGTGATCAAACCTCTGACGGGGTTTTCCCCGTCGAAATTGGGTGTTTCTTGTCTTGTTCTCACTTGGGGCATCTCCGTCAAGCCAAGAAAGTGCTCCCTGGATTCTGTTGCTAACGAGTCTCCTCTGCATTCCTGCTTGACTGATTGGGCGGACGGGGTGTCCACCTGACGCTGAGTATCGCCGTCACGGTGCCACATGTCTTATCTATTCAGGGATCAGAATTCATTCAGGAAATCAGGAGATGCTACACTTGGGTTATCGAAGCTCCTTCCAAGGCGTAGCAAGGGCGACTGAGCGCGTAAGCTCTAGATCTCCTCGTGTTGCAACTACACGCGCGGGTCACTCGAAACACATAGTATGAACTTAACGACTGCTCGTACTGAACAATGCTGAGGCAGAAGATCGCAGACCAGGCATCCCACTGCTTGAAAAAACTATNNNNCTACCCGCCTTTTTATTATCTCATCAGATCAAG
[247,253)   GAATTC
ACCGACGATTAGCTTTGTCCGAGTTACAACGGTTCAATAATACAAAGGATGGCATAAACCCATTTGTGTGAAAGTGCCCATCACATTATGATTCTGTCTACTATGGTTAATTCCCAATATACTCTCGAAAAGAGGGTATGCTCCCACGGCCATTTACGTCACTAAAAGATAAGATTGCTCAAANNNNNNNNNACTGCCAACTTGCTGGTAGCTTCAGGGGTTGTCCACAGCGGGGGGTCGTATGCCTTTGTGGTATACCTTACTAGCCGCGCCATGGTGCCTAAGAATGAAGTAAAACAATTGATGTGAGACTCGACAGCCAGGCTTCGCGCTAAGGACGCAAAGAAATTCCCTACATCAGACGGCCGCGNNNAACGATGCTATCGGTTAGGACATTGTGCCCTAGTATGTACATGCCTAATACAATTGGATCAAACGTTATTCCCACACACGGGTAGAAGAACNNNNATTACCCGTAGGCACTCCCCGATTCAAGTAGCCGCG
No match!

EDIT: I hope this helps you.

....
#include <seqan/index.h>
....

Pattern<Dna5String> pattern(Dna5String("GAATTC"));
Index< StringSet<Dna5String > > myIndex(seqs);
Finder< Index<StringSet<Dna5String > > > finder(myIndex);
while (find(finder, pattern)){
    std::cout << '[' << beginPosition(finder) << ',' << endPosition(finder)
              << ")\t" << infix(finder) << std::endl;
}   
....

You get:

[< 0 , 247 >,< 0 , 253 >)   GAATTC
Jose Ricardo Bustos M.
  • 8,016
  • 6
  • 40
  • 62
  • As stated in question, I realize this is an option, but the library documentation makes it sound as though you don't need to manually iterate. – merv Oct 06 '15 at 01:18
  • @merv I added an alternative solution ... I had to remove `Simple` from the `Pattern` declaration – Jose Ricardo Bustos M. Oct 06 '15 at 01:55
  • I appreciate your effort. However, your edit introduces indexing, which is to be avoided as per the original question. Indexing even moderately complex sequence files (e.g., transcriptomes) can take several minutes and GB's of memory, which is overkill if only searching for a few patterns. – merv Oct 06 '15 at 07:37