0

I would like to identify nodes via the XPath text() function that contain text with umlauts ("Umlaute").

# Load the HTML parsing (xml2) and scraping (rvest) packages used below.
library(xml2)
library(rvest)
# Parse a one-paragraph HTML fragment whose text contains an umlaut.
doc <- "<p>Über uns </p>" %>% xml2::read_html()
# Search for the literal text in the document converted to a character string.
grepl(pattern = "Über uns", x = as.character(doc))
# Same search, but handing the parsed document object to grepl() directly.
grepl(pattern = "Über uns", x = doc)

Questions:

How can I extract the node containing the text "Über uns"?

What I tried:

https://forum.fhem.de/index.php?topic=96254.0

Java XPath umlaut/vowel parsing

# Attempt 1: match the umlaut literally inside contains() -- does not work
xp <- paste0("//*[contains(text(), 'Über uns')]")
html_nodes(x = doc, xpath = xp)

# Attempt 2: map 'Ü' to 'U' with translate() and compare -- does not work
# NOTE(review): the source text parsed into `doc` has a trailing space
# ("Über uns "), so the exact '=' comparison against 'Uber uns' would fail
# even if the translation itself succeeded.
xp <- paste0("//*[translate(text(), 'Ü', 'U') = 'Uber uns']")
html_nodes(x = doc, xpath = xp)

# Attempt 3: write the umlaut as the HTML entity &Uuml; -- does not work
# NOTE(review): presumably the entity is not decoded inside the XPath string
# and is compared as literal characters -- confirm.
xp <- paste0("//*[contains(text(), '&Uuml;ber uns')]")
html_nodes(x = doc, xpath = xp)


# Workaround: this works, but I wonder if there is a solution with XPath alone.
# Convert the parsed document to a character string, replace "Ü" with the
# ASCII digraph "Ue", and parse the result again as HTML.
doc2 <- doc %>% 
  as.character() %>% 
  gsub(pattern = "Ü", replacement = "Ue") %>% 
  xml2::read_html()

# Query the re-parsed document using the ASCII-only spelling.
xp <- paste0("//*[contains(text(), 'Ueber uns')]")
html_nodes(x = doc2, xpath = xp)
Tlatwork
  • 1,445
  • 12
  • 35

1 Answer

1

This sounds like an encoding problem; it works here with en_US.UTF-8. Maybe change your default text encoding to UTF-8 (e.g. in RStudio: Tools - Global Options - Code - Saving - Default Text Encoding) or temporarily switch:

# Demonstrates that the XPath match on "Über uns" depends on the LC_CTYPE
# locale: the same query is run once under the "C" locale and once under
# "en_US.UTF-8". Lines starting with `#>` show the console output.
library(xml2)
library(rvest)

# Remember the current character-type locale so it can be restored at the end.
old.locale <- Sys.getlocale("LC_CTYPE")

Sys.setlocale("LC_CTYPE", 'C') # using non-UTF-8 encoding
#> [1] "C"
doc <- "<p>Über uns </p>" %>% xml2::read_html()
xp <- paste0("//*[contains(text(), 'Über uns')]")
html_nodes(x = doc, xpath = xp)
#> {xml_nodeset (0)}

Sys.setlocale("LC_CTYPE", 'en_US.UTF-8') # using UTF-8 encoding
#> [1] "en_US.UTF-8"

# Same parse and query as above; only the locale differs.
doc <- "<p>Über uns </p>" %>% xml2::read_html()
xp <- paste0("//*[contains(text(), 'Über uns')]")
html_nodes(x = doc, xpath = xp)
#> {xml_nodeset (1)}
#> [1] <p>Über uns </p>

# Restore the locale that was active before the demonstration.
Sys.setlocale("LC_CTYPE", old.locale)
#> [1] "en_US.UTF-8"
user12728748
  • 8,106
  • 2
  • 9
  • 14