I'm trying to build a regex to find a word inside a table in bigquery that can contain accents.
Play with example below
so the input can be transformed in scala or using functions of bigquery
It does the whole stuff in BigQuery Standard SQL
#standardSQL
CREATE TEMP FUNCTION latin2accents(word STRING) AS
((
WITH lookups AS (
SELECT
'y,a,e,i,o,u,c,ç,n,æ,œ,á,é,í,ó,ú,à,è,ì,ò,ù,ä,ë,ï,ö,ü,ÿ,â,ê,î,ô,û,å,ø,Ø,Å,Á,À,Â,Ä,È,É,Ê,Ë,Í,Î,Ï,Ì,Ò,Ó,Ô,Ö,Ú,Ù,Û,Ü,Ÿ,Ç,Æ,Œ,ñ' AS accents,
'y,a,e,i,o,u,c,c,n,ae,oe,a,e,i,o,u,a,e,i,o,u,a,e,i,o,u,y,a,e,i,o,u,a,o,O,A,A,A,A,A,E,E,E,E,I,I,I,I,O,O,O,O,U,U,U,U,Y,C,AE,OE,n' AS latins
), pairs AS (
SELECT accent, latin FROM lookups,
UNNEST(SPLIT(accents)) AS accent WITH OFFSET AS p1,
UNNEST(SPLIT(latins)) AS latin WITH OFFSET AS p2
WHERE p1 = p2
), map AS (
SELECT latin, CONCAT('[', STRING_AGG(accent, ''), ']') AS accents
FROM pairs
GROUP BY latin
)
SELECT CONCAT('(?i:', STRING_AGG(IFNULL(accents, char), ''), ')')
FROM UNNEST(SPLIT(word, '')) char
LEFT JOIN map
ON char = latin
));
WITH yourTable AS (
SELECT 'Séries' AS word UNION ALL SELECT 'Series' UNION ALL
SELECT 'brasília' UNION ALL SELECT 'Niño' UNION ALL SELECT 'aperçu'
), inputs AS (
SELECT 'series' AS input UNION ALL SELECT 'Brasilia' UNION ALL
SELECT 'nino' UNION ALL SELECT 'apercu'
)
SELECT input, word AS found_word
FROM yourTable CROSS JOIN inputs
WHERE REGEXP_CONTAINS(word, latin2accents(input)) = TRUE
ORDER BY input, word
The output is (which I think is what you wanted)
input found_word
----- ----------
Brasilia brasília
apercu aperçu
nino Niño
series Series
series Séries