I'm looking for a method that sanitizes search terms passed to Elasticsearch, i.e. escapes all the characters that have a special meaning in the query syntax. Something like what is described for Ruby in this answer. Is there such a thing for Scala?
Asked
Active
Viewed 750 times
2 Answers
2
I've translated the Ruby solution from this answer into Scala:
package util
import java.util.regex.Pattern
/**
 * Escapes free-text search terms so that they can be embedded in an
 * Elasticsearch `query_string` query without being interpreted as query syntax.
 *
 * Mix the trait into a class or object and call `sanitize`.
 */
trait ElasticSearchSanitizer {

  /**
   * Sanitizes special characters and set operators in Elasticsearch search terms.
   *
   * The steps run in this order:
   *  1. `<` and `>` are replaced by a space (they cannot be escaped),
   *  2. every other reserved character is prefixed with a backslash,
   *  3. the upper-case operators `AND`, `OR` and `NOT` are escaped letter by letter,
   *  4. runs of whitespace are collapsed into a single space,
   *  5. if the number of double quotes is odd, the last one is escaped.
   *
   * @param term the raw search term
   * @return the sanitized term
   */
  def sanitize(term: String): String = (
    dropUnescapableCharacters _ andThen
      escapeSpecialCharacters andThen
      escapeSetOperators andThen
      collapseWhiteSpaces andThen
      escapeOddQuote
  )(term)

  /**
   * Replaces `<` and `>` with a space.
   *
   * Elasticsearch documents that these two characters cannot be escaped and have
   * to be removed from the query string. A space is used instead of plain removal
   * so that the words on either side do not merge into one token; the extra
   * whitespace is folded away later by `collapseWhiteSpaces`.
   */
  private def dropUnescapableCharacters(term: String): String =
    term.replaceAll("[<>]", " ")

  /** Prefixes every reserved query character (including `=`) with a backslash. */
  private def escapeSpecialCharacters(term: String): String = {
    // Pattern.quote wraps the list in a quoted section, so inside the character
    // class below every symbol is taken literally (no ranges, no negation).
    val escapedCharacters = Pattern.quote("""\/+-=&|!(){}[]^~*?:""")
    // In the replacement, the four backslashes produce one literal backslash.
    term.replaceAll(s"([$escapedCharacters])", "\\\\$1")
  }

  /**
   * Escapes the boolean operators `AND`, `OR` and `NOT` when they appear as whole,
   * upper-case words. Lower-case words and operators embedded in longer words are
   * left alone.
   */
  private def escapeSetOperators(term: String): String = {
    val operators = Set("AND", "OR", "NOT")
    operators.foldLeft(term) { case (accTerm, op) =>
      val escapedOp = escapeEachCharacter(op)
      accTerm.replaceAll(s"""\\b($op)\\b""", escapedOp)
    }
  }

  /**
   * Builds a regex replacement string that renders `op` with a backslash in front
   * of each character, e.g. `AND` becomes `\A\N\D` in the output.
   */
  private def escapeEachCharacter(op: String): String =
    op.toCharArray.map(ch => s"""\\\\$ch""").mkString

  /** Replaces each run of whitespace with a single space (no trimming). */
  private def collapseWhiteSpaces(term: String): String = term.replaceAll("""\s+""", " ")

  /**
   * Escapes the last double quote when the term contains an odd number of them,
   * so that the remaining quotes pair up.
   */
  private def escapeOddQuote(term: String): String = {
    // The first group is greedy, so the quote that gets escaped is the last one.
    if (term.count(_ == '"') % 2 == 1) term.replaceAll("""(.*)"(.*)""", """$1\\"$2""") else term
  }
}
And here are the tests:
package util
import org.specs2.matcher.Matchers
import org.specs2.mutable.Specification
/** Behavioural spec for `ElasticSearchSanitizer.sanitize`. */
class ElasticSearchSanitizerSpec extends Specification with Matchers {

  "sanitize" should {

    // Concrete instance of the trait under test.
    object S extends ElasticSearchSanitizer

    "escape special characters" in {
      S.sanitize("""back\slash""") mustEqual """back\\slash"""
      S.sanitize("""sl/ash""") mustEqual """sl\/ash"""
      S.sanitize("""pl+us""") mustEqual """pl\+us"""
      S.sanitize("""mi-nus""") mustEqual """mi\-nus"""
      S.sanitize("""amper&sand""") mustEqual """amper\&sand"""
      S.sanitize("""pi|pe""") mustEqual """pi\|pe"""
      S.sanitize("""ba!ng""") mustEqual """ba\!ng"""
      S.sanitize("""open(parenthesis""") mustEqual """open\(parenthesis"""
      S.sanitize("""close)parenthesis""") mustEqual """close\)parenthesis"""
      S.sanitize("""open{curly""") mustEqual """open\{curly"""
      S.sanitize("""close}curly""") mustEqual """close\}curly"""
      S.sanitize("""open[bracket""") mustEqual """open\[bracket"""
      // Uses the closing bracket, so that `]` is covered as well as `[`.
      S.sanitize("""close]bracket""") mustEqual """close\]bracket"""
      S.sanitize("""circum^flex""") mustEqual """circum\^flex"""
      S.sanitize("""til~de""") mustEqual """til\~de"""
      S.sanitize("""aste*risk""") mustEqual """aste\*risk"""
      S.sanitize("""ques?tion""") mustEqual """ques\?tion"""
      S.sanitize("""co:lon""") mustEqual """co\:lon"""
    }

    "escape set operators" in {
      S.sanitize("gin AND tonic") mustEqual """gin \A\N\D tonic"""
      S.sanitize("now OR never") mustEqual """now \O\R never"""
      S.sanitize("NOT never") mustEqual """\N\O\T never"""
    }

    "not escape set operators if part of words" in {
      S.sanitize("MANDATE") mustEqual "MANDATE"
      S.sanitize("NOTORIOUS") mustEqual "NOTORIOUS"
    }

    "not escape set operators if lowercase" in {
      S.sanitize("and or not") mustEqual "and or not"
    }

    "collapse excess whitespaces" in {
      // The space-tab-space run must come out as a single space.
      S.sanitize("Y u no use single \t space??") mustEqual """Y u no use single space\?\?"""
    }

    "escape last quote if number of quotes is odd" in {
      S.sanitize("""Che "Guevarra" wears me" on his t shirt""") mustEqual """Che "Guevarra" wears me\" on his t shirt"""
    }

    "not escape any quotes if number of quotes even" in {
      S.sanitize("""Using these "lasers", we punch a hole in the "ozone layer"... """) mustEqual
        """Using these "lasers", we punch a hole in the "ozone layer"... """
    }
  }
}
2
Lucene already ships a helper for this, and you're probably better off using it than writing your own:
import org.apache.lucene.queryparser.classic.QueryParserBase
....
// Delegates the escaping to Lucene's QueryParserBase.escape instead of hand-written regexes.
// rawQuery is not defined in this snippet; presumably it is the unsanitized user input.
val escapedQ = QueryParserBase.escape(rawQuery)

Oliver Shaw
- 5,235
- 4
- 26
- 35