0

I am trying to replace multiple non-ascii characters with an empty space or just remove characters in T-SQL.

'øsmeøø' string should be replaced be 'sme'

I have tried using the following

 SELECT STUFF('smeøø string',PATINDEX('%[' + CHAR(127)+ '-' +CHAR(255)+']%
 'COLLATE Latin1_General_100_BIN2,'smeøø string'),1,'')

This returns the following but I want it to continue till Patindex is 0

 smeø string

http://www.sqlfiddle.com/#!18/9eecb/41689

Learn AspNet
  • 1,192
  • 3
  • 34
  • 74

1 Answers1

0

UPDATED ON 20190419 TO DEMO A SOLUTION THAT DOES NOT REQUIRE YOU TO CREATE AN NGrams8K FUNCTION

First grab a copy of NGrams8K

Remove "bad" characters:

DECLARE @string VARCHAR(1000) = 'øsmøeøø' COLLATE Latin1_General_100_BIN2;

SELECT 
(
  SELECT    ng.token+''
  FROM      dbo.ngrams8k(@string,1) AS ng
  WHERE     ASCII(ng.token) < 127
  ORDER BY ng.position
  FOR XML PATH(''), TYPE
).value('(text())[1]', 'VARCHAR(8000)');

Replace "bad" characters with spaces:

SELECT 
(
  SELECT    CASE WHEN ASCII(ng.token) < 127 THEN ng.token ELSE ' ' END+''
  FROM      dbo.ngrams8k(@string,1) AS ng
  ORDER BY ng.position
  FOR XML PATH(''), TYPE
).value('(text())[1]', 'VARCHAR(8000)');

... and for the 2017++ version if that's what you're running and you want cleaner code:

-- Remove bad characters
SELECT STRING_AGG(ng.token,'') WITHIN GROUP (ORDER BY ng.position)
FROM   dbo.ngrams8k(@string,1) AS ng
WHERE  ASCII(ng.token) < 127;

-- Replace bad characters
SELECT STRING_AGG(IIF(ASCII(ng.token) < 127,ng.token,' '),'') WITHIN GROUP (ORDER BY ng.position)
FROM   dbo.ngrams8k(@string,1) AS ng;

UPDATED SOLUTIONS USING NGrams8K LOGIC TRANSFORMED INTO A SUBQUERY

Note my comments inside the query...

DECLARE @string VARCHAR(1000) = 'øsmøeøø' COLLATE Latin1_General_100_BIN2, @N INT = 1;

-- Remove bad characters
SELECT 
(
  SELECT    ng.token+''
  FROM      
  (
    SELECT Position = N,
           Token    = SUBSTRING(@string,CAST(N AS int),@N)
    FROM
    (
      SELECT TOP(ABS(CONVERT(BIGINT,(DATALENGTH(ISNULL(@string,''))-(ISNULL(@N,1)-1)),0)))
        ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
      FROM 
      (  SELECT 1 FROM (VALUES    -- 90 "dummy" values used to create the CTE Tally Table
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(N)) AS L1(x),
      (  SELECT 1 FROM (VALUES    -- 90*90=8100, enough for varchar(8000)
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(N)) AS L2(x) 
    ) AS iTally(N)
    WHERE @N > 0 AND @N <= DATALENGTH(@string)
  ) AS ng -- dbo.NGrams8K as an inline function
  WHERE     ASCII(ng.token) < 127
  ORDER BY ng.position
  FOR XML PATH(''), TYPE
).value('(text())[1]', 'VARCHAR(8000)');

-- Replace bad characters 
SELECT 
(
  SELECT    CASE WHEN ASCII(ng.token) < 127 THEN ng.token ELSE ' ' END+''
  FROM   --dbo.ngrams8k(@string,1) AS ng
  (
    SELECT Position = N,
           Token    = SUBSTRING(@string,CAST(N AS int),@N)
    FROM
    (
      SELECT TOP(ABS(CONVERT(BIGINT,(DATALENGTH(ISNULL(@string,''))-(ISNULL(@N,1)-1)),0)))
        ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
      FROM 
      (  SELECT 1 FROM (VALUES    -- 90 "dummy" values used to create the CTE Tally Table
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(N)) AS L1(x),
      (  SELECT 1 FROM (VALUES    -- 90*90=8100, enough for varchar(8000)
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
            (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(N)) AS L2(x) 
    ) AS iTally(N)
    WHERE @N > 0 AND @N <= DATALENGTH(@string)
  ) AS ng -- dbo.NGrams8K as an inline function
  ORDER BY ng.position
  FOR XML PATH(''), TYPE
).value('(text())[1]', 'VARCHAR(8000)');

-- Remove bad characters using STRING_AGG (SQL 2017++)
SELECT STRING_AGG(ng.token,'') WITHIN GROUP (ORDER BY ng.position)
FROM   --dbo.ngrams8k(@string,1) AS ng
(
  SELECT Position = N,
         Token    = SUBSTRING(@string,CAST(N AS int),@N)
  FROM
  (
    SELECT TOP(ABS(CONVERT(BIGINT,(DATALENGTH(ISNULL(@string,''))-(ISNULL(@N,1)-1)),0)))
      ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
    FROM 
    (  SELECT 1 FROM (VALUES    -- 90 "dummy" values used to create the CTE Tally Table
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(N)) AS L1(x),
    (  SELECT 1 FROM (VALUES    -- 90*90=8100, enough for varchar(8000)
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(N)) AS L2(x) 
  ) AS iTally(N)
  WHERE @N > 0 AND @N <= DATALENGTH(@string)
) AS ng -- dbo.NGrams8K as an inline function
WHERE  ASCII(ng.token) < 127;

-- Replace bad characters using STRING_AGG (SQL 2017++)
SELECT STRING_AGG(IIF(ASCII(ng.token) < 127,ng.token,' '),'') WITHIN GROUP (ORDER BY ng.position)
FROM   --dbo.ngrams8k(@string,1) AS ng
(
  SELECT Position = N,
         Token    = SUBSTRING(@string,CAST(N AS int),@N)
  FROM
  (
    SELECT TOP(ABS(CONVERT(BIGINT,(DATALENGTH(ISNULL(@string,''))-(ISNULL(@N,1)-1)),0)))
      ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
    FROM 
    (  SELECT 1 FROM (VALUES    -- 90 "dummy" values used to create the CTE Tally Table
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(N)) AS L1(x),
    (  SELECT 1 FROM (VALUES    -- 90*90=8100, enough for varchar(8000)
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),
          (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(N)) AS L2(x) 
  ) AS iTally(N)
  WHERE @N > 0 AND @N <= DATALENGTH(@string)
) AS ng; -- dbo.NGrams8K as an inline function
Alan Burstein
  • 7,770
  • 1
  • 15
  • 18
  • Please explain your answer and I cannot grab a copy of NGrams8K. I have to use it somewhere on production data – Learn AspNet Apr 19 '19 at 13:26
  • NGrams8K takes a string and returns a table with one row for each token; even if you cannot create the function you can still read reference the article to see how the function works. I updated my answer to show how to add the logic inline so that you don't need to create dbo.ngrams8k in Production. – Alan Burstein Apr 19 '19 at 18:23