I need to recursively JOIN a large graph that can have cycles.
Now, a dumbed down version of that looks like this in SQL-Server:
-- Array:10'000: 1.6s (postgresql)
-- LIKE: 10'000: 19s (sql-server)
-- JSON: 10'000: 19s (sql-server)
-- XML : 10'000: +infinity s (sql-server)
-- XML : 300 (a): 17s (nullable) (sql-server)
-- XML : 300 (b): 17s (not nullable) (sql-server)
-- XML : 300 (c): 23s (nullable, xpath) (sql-server)
;WITH CTE AS
(
SELECT
1 AS i
,CAST(N',1,' AS national character varying(MAX)) AS paths
UNION ALL
SELECT
CTE.i+1 AS i
,CTE.paths + CAST((CTE.i + 1) AS national character varying(36)) + N',' AS paths
FROM CTE
WHERE CTE.i < 10000
AND CTE.paths LIKE (N'%,' + CAST((CTE.i) AS national character varying(36)) + N',%')
)
SELECT i FROM CTE
OPTION (MAXRECURSION 0)
Since SQL-Server doesn't support arrays, I tried to optimize by using JSON:
;WITH CTE AS
(
SELECT
1 AS i
,CAST('[1]' AS nvarchar(MAX)) AS paths
UNION ALL
SELECT
CTE.i+1 AS i
,JSON_MODIFY(CTE.paths, N'append $', CTE.i+1) AS paths
FROM CTE
WHERE CTE.i < 1000
AND CTE.i IN
(
SELECT
nda.value
-- OpenJSON: ALTER DATABASE <db_name()> SET COMPATIBILITY_LEVEL = 130
FROM OPENJSON(CTE.paths, N'$') AS nda
)
)
SELECT i FROM CTE
OPTION (MAXRECURSION 0)
but JSON isn't any faster, plus it only works with SQL-Server 2016+.
I also tried with XML:
;WITH CTE AS
(
SELECT
1 AS i
,CAST(N'<table xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><row>1</row>' AS nvarchar(MAX)) AS paths
UNION ALL
SELECT
CTE.i+1 AS i
-- <row xsi:nil="true"/>
,CTE.paths + N'<row>' + CAST((CTE.i + 1) AS nvarchar(36)) + N'</row>' AS paths
FROM CTE
WHERE CTE.i < 300
AND CTE.i IN
(
SELECT
xmlRow.xmlElement.value(N'.', N'int') AS v
FROM
(
SELECT CAST(CTE.paths + N'</table>' AS xml) AS xmlArray
) AS tXmlArrayConverter
CROSS APPLY tXmlArrayConverter.xmlArray.nodes(N'//row') AS xmlRow(xmlElement)
)
)
SELECT i FROM CTE
OPTION (MAXRECURSION 0)
and while that would work with sql-server < 2016, this is just too damn slow - far worse than like.
The only way to get a decent performance has been switching to PostgreSQL, which supports arrays:
;WITH RECURSIVE CTE AS
(
SELECT
1 AS i
,array[1] AS paths
UNION ALL
SELECT
CTE.i+1 AS i
,CTE.paths || (CTE.i + 1) AS paths
FROM CTE
WHERE CTE.i < 10000
AND CTE.i = ANY(CTE.paths)
-- AND CTE.i <> ALL(CTE.paths)
)
SELECT i FROM CTE
this completes in 1.6s, which is about as good as I would expect.
Is there any way to improve performance in SQL-Server ?
Note:
This dumbed-down example with CTE.i IN (whatever)
is not supposed to make any sense, it's just for speed comparison.