There is a news website I frequent that has a series of headlines on their main page. Clicking the headline takes you to the individual story. I am trying to write a Powershell script that will loop through all the headlines on the main page and write each story to a text file.
The problem I am having is the stories are in Spanish and the Spanish characters with accent marks do not show up properly in my text file (actually the weird thing is, sometimes they do, but the majority of the time they don't). I've checked the headers of each story and the charset is set to UTF8 so I think the web pages themselves are formatted correctly. I've tried every way I know of to set the output file as UTF8 as well, but I can't seem to get it fixed.
Anyone have any ideas? Here is the code:
$ie = New-Object -ComObject 'InternetExplorer.Application'
$url = "https://www3.nhk.or.jp/nhkworld/es/news/"
#$ie.Visible = $true
$ie.Navigate($url)
while($ie.busy) {Start-Sleep 1}
$file = "C:\temp\nhk.txt"
if(Test-Path $file) { Remove-Item $file }
$lastLink = $null
foreach($link in $ie.Document.getElementsByTagName("a")) {
if($link.href -match "\d{6}") { #the links to the stories we want are numbered with 6 digits
if(-not($link.href -eq $lastLink)) {
$uri = $link.href
$w = Invoke-WebRequest -Uri $uri
ForEach($element in $w.AllElements | where tagname -eq "p") {
$text = $element | select -expand innerText
$text = $text + "`r`n"
Add-Content -Path $file -Value $text
}
$lastLink = $link.href
}
}
}