I have seen loads of solutions that use a scalar udf with a loop, but I don't like either of these things, so throwing my hat into the ring with a different approach.
With the help of a numbers table you can deconstruct each value into its individual characters, remove non-numeric characters, then reconstruct it using FOR XML
to concatenate rows, e.g.
WITH Numbers (Number) AS
( SELECT ROW_NUMBER() OVER(ORDER BY N1.N)
FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N1 (N) -- 100
CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N2 (N) -- 100
CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N3 (N) -- 1,000
--CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N4 (N) -- 10,000
--CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N5 (N) -- 100,000
--COMMENT OR UNCOMMENT ROWS AS NECESSARY DEPENDING ON YOU MAX STRING LENGTH
)
SELECT t.dats,
Stripped = x.data.value('.', 'INT')
FROM @tabl AS t
CROSS APPLY
( SELECT SUBSTRING(t.dats, n.Number, 1)
FROM Numbers n
WHERE n.Number <= LEN(t.dats)
AND SUBSTRING(t.dats, n.Number, 1) LIKE '[0-9]'
ORDER BY n.Number
FOR XML PATH(''), TYPE
) x (data);
Gives:
dats Stripped
----------------------
103-P705hh 103705
115-xxx-44 11544
103-705.13 10370513
525-hheef4 5254
I haven't done any testing so it could be that the added overhead of expanding each string into individual characters and reconstructing it is actually a lot more overhead than than a UDF with a loop.
I decided to bench mark this
1. Set up functions
CREATE FUNCTION dbo.ExtractNumeric_TVF (@Input VARCHAR(8000))
RETURNS TABLE
AS
RETURN
( WITH Numbers (Number) AS
( SELECT TOP (LEN(@Input)) ROW_NUMBER() OVER(ORDER BY N1.N)
FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N1 (N) -- 100
CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N2 (N) -- 100
CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N3 (N) -- 1,000
CROSS JOIN (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS N4 (N) -- 10,000
)
SELECT Stripped = x.data.value('.', 'VARCHAR(MAX)')
FROM ( SELECT SUBSTRING(@Input, n.Number, 1)
FROM Numbers n
WHERE n.Number <= LEN(@Input)
AND SUBSTRING(@Input, n.Number, 1) LIKE '[0-9]'
ORDER BY n.Number
FOR XML PATH(''), TYPE
) x (data)
);
GO
create function dbo.ExtractNumeric_UDF(@s varchar(8000))
returns varchar(8000)
as
begin
declare @out varchar(max) = ''
declare @c char(1)
while len(@s) > 0 begin
set @c = left(@s,1)
if @c like '[0123456789]' set @out += @c
set @s = substring(@s, 2, len(@s) -1)
end
return @out
end
GO
2. Create first set of sample data and log table
CREATE TABLE dbo.T (Value VARCHAR(8000) NOT NULL);
INSERT dbo.T (Value)
SELECT TOP 1000 LEFT(NEWID(), CEILING(RAND(CHECKSUM(NEWID())) * 36))
FROM sys.all_objects a
CROSS JOIN sys.all_objects b;
CREATE TABLE dbo.TestLog (Fx VARCHAR(255), NumberOfRows INT, TimeStart DATETIME2(7), TimeEnd DATETIME2(7))
3. Run Tests
GO
DECLARE @T TABLE (Val VARCHAR(8000));
INSERT dbo.TestLog (fx, NumberOfRows, TimeStart)
VALUES ('dbo.ExtractNumeric_UDF', 1000, SYSDATETIME());
INSERT @T (Val)
SELECT dbo.ExtractNumeric_UDF(Value)
FROM dbo.T;
UPDATE dbo.TestLog
SET TimeEnd = SYSDATETIME()
WHERE TimeEnd IS NULL;
GO 100
DECLARE @T TABLE (Val VARCHAR(8000));
INSERT dbo.TestLog (fx, NumberOfRows, TimeStart)
VALUES ('dbo.ExtractNumeric_TVF', 1000, SYSDATETIME());
INSERT @T (Val)
SELECT f.Stripped
FROM dbo.T
CROSS APPLY dbo.ExtractNumeric_TVF(Value) f;
UPDATE dbo.TestLog
SET TimeEnd = SYSDATETIME()
WHERE TimeEnd IS NULL;
GO 100
4. Get Results
SELECT Fx,
NumberOfRows,
RunTime = AVG(DATEDIFF(MILLISECOND, TimeStart, TimeEnd))
FROM dbo.TestLog
GROUP BY fx, NumberOfRows;
I did the following (using just NEWID()
so only a maximum of 36 characters) over 1,000 and 10,000 rows, the results were:
Fx NumberOfRows RunTime
--------------------------------------------------------
dbo.ExtractNumeric_TVF 1000 31
dbo.ExtractNumeric_UDF 1000 56
dbo.ExtractNumeric_TVF 10000 280
dbo.ExtractNumeric_UDF 10000 510
So the TVF coming in at just under half the time of the UDF.
I wanted to test edge cases so put 1,000 rows of longer strings (5,400 characters)
TRUNCATE TABLE dbo.T;
INSERT dbo.T (Value)
SELECT TOP 1000
REPLICATE(CONCAT(NEWID(), NEWID(), NEWID(), NEWID(), NEWID()), 30)
FROM sys.all_objects a
CROSS JOIN sys.all_objects b;
And this is where the TVF came into its own, running over 5x faster:
Fx NumberOfRows RunTime
------------------------------------------------
dbo.ExtractNumeric_TVF 1000 2485
dbo.ExtractNumeric_UDF 1000 12955