1

I want to return integers from rather complex strings which combined unicode characters such as - and . with characters and integers.

I've come a long way in achieving this, but I still have troubles with some strings of a more complex structure. For instance:

DECLARE @Tabl as table
(
   dats nvarchar(15)
)

INSERT INTO @Tabl VALUES
('103-P705hh'),
('115-xxx-44'),
('103-705.13'),
('525-hheef4')

select LEFT(SUBSTRING(REPLACE(REPLACE(dats, '.',''),'-',''), PATINDEX('%[0-9.-]%', REPLACE(REPLACE(dats, '.',''),'-','')), 8000),
       PATINDEX('%[^0-9.-]%', SUBSTRING(REPLACE(REPLACE(dats, '.',''),'-',''), PATINDEX('%[0-9.-]%', REPLACE(REPLACE(dats, '.',''),'-','')), 8000) + 'X')-1)
from @tabl

Gives

Raw Input          Actual return:          Desired return:
103-P705hh         103                     103705
115-xxx-44         115                     11544
103-705.13         10370513                10370513
525-hheef4         525                     5254

I had a topic regarding this yesterday to cover the case when multiple - or . are present, but as seen in the return this is actually taken care of now. However, expanding the databases I work with I encountered much more complex string such as those I presented here.

Does anyone have any idea what to do when characters and integers are "mixed up" in the string?

Regards, Cenderze

Cenderze
  • 1,202
  • 5
  • 33
  • 56
  • Possible duplicate of [Query to get only numbers from a string](http://stackoverflow.com/questions/16667251/query-to-get-only-numbers-from-a-string) – Peter B Feb 15 '17 at 14:29

4 Answers4

4

I have seen loads of solutions that use a scalar udf with a loop, but I don't like either of these things, so throwing my hat into the ring with a different approach.

With the help of a numbers table you can deconstruct each value into its individual characters, remove non-numeric characters, then reconstruct it using FOR XML to concatenate rows, e.g.

WITH Numbers (Number) AS
(   SELECT  ROW_NUMBER() OVER(ORDER BY N1.N)
    FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N1 (N)         -- 100
    CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N2 (N)   -- 100
    CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N3 (N)   -- 1,000
    --CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N4 (N)   -- 10,000
    --CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N5 (N) -- 100,000

    --COMMENT OR UNCOMMENT ROWS AS NECESSARY DEPENDING ON YOU MAX STRING LENGTH
)
SELECT  t.dats,
        Stripped = x.data.value('.', 'INT')
FROM    @tabl AS t
        CROSS APPLY
        (   SELECT  SUBSTRING(t.dats, n.Number, 1)
            FROM    Numbers n
            WHERE   n.Number <= LEN(t.dats)
            AND     SUBSTRING(t.dats, n.Number, 1) LIKE '[0-9]'
            ORDER BY n.Number
            FOR XML PATH(''), TYPE
        ) x (data);

Gives:

dats            Stripped
----------------------
103-P705hh      103705
115-xxx-44      11544
103-705.13      10370513
525-hheef4      5254

I haven't done any testing so it could be that the added overhead of expanding each string into individual characters and reconstructing it is actually a lot more overhead than than a UDF with a loop.


I decided to bench mark this

1. Set up functions

CREATE FUNCTION dbo.ExtractNumeric_TVF (@Input VARCHAR(8000))
RETURNS TABLE
AS
RETURN
(   WITH Numbers (Number) AS
    (   SELECT TOP (LEN(@Input)) ROW_NUMBER() OVER(ORDER BY N1.N)
        FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N1 (N)         -- 100
        CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N2 (N)   -- 100
        CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N3 (N)   -- 1,000
        CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N4 (N)   -- 10,000
    )
    SELECT  Stripped = x.data.value('.', 'VARCHAR(MAX)')
    FROM    (   SELECT  SUBSTRING(@Input, n.Number, 1)
                FROM    Numbers n
                WHERE   n.Number <= LEN(@Input)
                AND     SUBSTRING(@Input, n.Number, 1) LIKE '[0-9]'
                ORDER BY n.Number
                FOR XML PATH(''), TYPE
            ) x (data)
);
GO
create function dbo.ExtractNumeric_UDF(@s varchar(8000))
returns varchar(8000)
as
begin
    declare @out varchar(max) = ''
    declare @c char(1)
    while len(@s) > 0 begin
        set @c = left(@s,1)
        if @c like '[0123456789]' set @out += @c
        set @s = substring(@s, 2, len(@s) -1)
    end
    return @out
end
GO

2. Create first set of sample data and log table

CREATE TABLE dbo.T (Value VARCHAR(8000) NOT NULL);
INSERT dbo.T (Value)
SELECT  TOP 1000 LEFT(NEWID(), CEILING(RAND(CHECKSUM(NEWID())) * 36))
FROM    sys.all_objects a
CROSS JOIN sys.all_objects b;

CREATE TABLE dbo.TestLog (Fx VARCHAR(255), NumberOfRows INT, TimeStart DATETIME2(7), TimeEnd DATETIME2(7))

3. Run Tests

GO
DECLARE @T TABLE (Val VARCHAR(8000));
INSERT dbo.TestLog (fx, NumberOfRows, TimeStart)
VALUES ('dbo.ExtractNumeric_UDF', 1000, SYSDATETIME());

INSERT @T (Val)
SELECT  dbo.ExtractNumeric_UDF(Value)
FROM    dbo.T;

UPDATE  dbo.TestLog
SET     TimeEnd = SYSDATETIME()
WHERE   TimeEnd IS NULL;

GO 100
DECLARE @T TABLE (Val VARCHAR(8000));
INSERT dbo.TestLog (fx, NumberOfRows, TimeStart)
VALUES ('dbo.ExtractNumeric_TVF', 1000, SYSDATETIME());

INSERT @T (Val)
SELECT  f.Stripped
FROM    dbo.T
        CROSS APPLY dbo.ExtractNumeric_TVF(Value) f;

UPDATE  dbo.TestLog
SET     TimeEnd = SYSDATETIME()
WHERE   TimeEnd IS NULL;

GO 100

4. Get Results

SELECT  Fx,
        NumberOfRows,
        RunTime = AVG(DATEDIFF(MILLISECOND, TimeStart, TimeEnd))
FROM    dbo.TestLog
GROUP BY fx, NumberOfRows;

I did the following (using just NEWID() so only a maximum of 36 characters) over 1,000 and 10,000 rows, the results were:

Fx                          NumberOfRows    RunTime
--------------------------------------------------------
dbo.ExtractNumeric_TVF      1000            31
dbo.ExtractNumeric_UDF      1000            56
dbo.ExtractNumeric_TVF      10000           280
dbo.ExtractNumeric_UDF      10000           510

So the TVF coming in at just under half the time of the UDF.

I wanted to test edge cases so put 1,000 rows of longer strings (5,400 characters)

TRUNCATE TABLE dbo.T;
INSERT dbo.T (Value)
SELECT  TOP 1000 
        REPLICATE(CONCAT(NEWID(), NEWID(), NEWID(), NEWID(), NEWID()), 30)
FROM    sys.all_objects a
CROSS JOIN sys.all_objects b;

And this is where the TVF came into its own, running over 5x faster:

Fx                      NumberOfRows    RunTime 
------------------------------------------------
dbo.ExtractNumeric_TVF  1000            2485    
dbo.ExtractNumeric_UDF  1000            12955   
GarethD
  • 68,045
  • 10
  • 83
  • 123
  • This works like a charm (and fast too, 52 000 records in 1 sec, I'd assume it would take somewhat longer than that). – Cenderze Feb 15 '17 at 15:23
  • I added my solution to your test and while it also totally crushed the looping, scalar function approaches it was definitely not as performant as this solution. – Sean Lange Feb 15 '17 at 16:26
  • @GarethD So I tried to create your UDF by copying it into a new Query and ran it. Then I used Select f.stripped from myDatabase t cross apply dbo.ExtractNumeric_TVF(t.articlenr) f but this does not work (Invalid column name articlenr). Any possibility I can get some help with this? I want to use all integer strings on joins etc. so the Queries got so big without using an UDF (which I am new at). – Cenderze Feb 16 '17 at 08:21
  • 1
    The problem of an invalid column name is nothing to do with the function, it just means that you don't have a column called `articlenr` in your table. Can you post your full query and table structure, and sample data to a new question and I will take a look. – GarethD Feb 16 '17 at 08:30
  • @GarethD Thanks! I figured it out now. Now it works even as a function! – Cenderze Feb 16 '17 at 08:38
  • There's a pretty good write-up on a similar solution [here](http://www.sqlservercentral.com/articles/String+Manipulation/94365/). Specifically the `PatternSplitCM` version of the code. – Erik Darling Nov 25 '17 at 15:09
2

I also really don't like the looping solutions so I decided to try my hand at one. This is using a predefined tally table but is quite similar to others posted here already.

This is my tally table. I keep this as a view on my system.

create View [dbo].[cteTally] as

WITH
    E1(N) AS (select 1 from (values (1),(1),(1),(1),(1),(1),(1),(1),(1),(1))dt(n)),
    E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
    E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
    cteTally(N) AS 
    (
        SELECT  ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
    )
select N from cteTally
GO

Because I don't like looping I decided to use the table valued function approach which let me reuse this functionality in other queries with little to no effort. Here is one way to write such a function.

create function GetOnlyNumbers
(
    @SearchVal varchar(8000)
) returns table as return

    with MyValues as
    (
        select substring(@SearchVal, N, 1) as number
            , t.N
        from cteTally t 
        where N <= len(@SearchVal)
            and substring(@SearchVal, N, 1) like '[0-9]'
    )

    select distinct NumValue = STUFF((select number + ''
                from MyValues mv2
                order by mv2.N
                for xml path('')), 1, 0, '')
    from MyValues mv

That looks good but the proof is in the pudding. Let's take this out with our sample data and kick the tires a few times.

DECLARE @Tabl as table
(
   dats nvarchar(15)
)

INSERT INTO @Tabl VALUES
('103-P705hh'),
('115-xxx-44'),
('103-705.13'),
('525-hheef4')


select *
from @Tabl t
cross apply dbo.GetOnlyNumbers(t.dats) x

Sure looks nice and tidy. I tested against several of the other solutions posted here and without going into deep testing this appears to be significantly faster than the other approaches posted at this time.

Sean Lange
  • 33,028
  • 3
  • 25
  • 40
1

Can you use a udf ? If so, try this

create alter function numerals(@s varchar(max))
returns varchar(max)
as
begin
    declare @out varchar(max) = ''
    declare @c char(1)
    while len(@s) > 0 begin
        set @c = left(@s,1)
        if @c like '[0123456789]' set @out += @c
        set @s = substring(@s, 2, len(@s) -1)
    end
    return @out
end

to use it on your temp table...

select dbo.numerals(dats) from @Tabl

another solution, that does not use a UDF, but will work only if your table has a primary key, uses a recursive CTE. It is:

DECLARE @Tabl as table
(pk int identity not null,  -- <=== added a primary key
 dats nvarchar(max) )

INSERT INTO @Tabl VALUES
  ('103-P705hh'),
  ('115-xxx-44'),
  ('103-705.13'),
  ('525-hheef4');

 with newVals(pk, pos, newD) as 
     (select pk, 1, 
         case when left(Dats,1) like '[0123456789]'  
              then left(Dats,1) else '' end
     from @tabl
     Union All
     Select t.pk, pos + 1, n.newD +
        case when substring(dats, pos+1, 1) like '[0123456789]'   
             then substring(dats, pos+1, 1) else '' end          
     from @tabl t join newVals n on n.pk = t.pk
     where pos+1 <= len(dats) )         
     Select newD from newVals x         
     where pos = (Select Max(pos)
                  from newVals 
                  where pk = x.pk)
Charles Bretana
  • 143,358
  • 22
  • 150
  • 216
1
DECLARE @Tabl as table
(
   ID   INT,
   dats nvarchar(15)
)

INSERT INTO @Tabl VALUES
(1, '103-P705hh'),
(2, '115-xxx-44'),
(3, '103-705.13'),
(4, '525-hheef4')


SELECT T.ID, t.dats
,(
  SELECT SUBSTRING(tt.dats,V.number,1)
  FROM @Tabl tt
    JOIN master.dbo.spt_values V ON V.type='P' AND V.number BETWEEN 1 AND LEN(tt.dats)
  WHERE tt.ID=T.ID AND SUBSTRING(TT.dats,V.number,1) LIKE '[0-9]'
  ORDER BY V.number
  FOR XML PATH('')
 ) S
FROM @Tabl t
ORDER BY T.ID;
Denis Rubashkin
  • 2,151
  • 1
  • 9
  • 14