1

I have a table that is storing groups of related rows, the different rows are related via a groupIdentifier column. Groups can be any number of rows in size.

I need to be able to pass in a new set of groups of rows and then find a mapping of new to existing matching groups. The complication is that the order of each row within the group is defined by a rowOrdinal value and must be taken into account. That rowOrdinal value is not always 0 based but the rows within a group are sorted by that value. Also @existingData contains 100s of thousands of potential groups, so the query needs to be performant

Here is an example input dataset:

declare @existingData table (
    groupIdentifier int,
    rowOrdinal int,
    value varchar(1))

insert into @existingData values 
    (100, 0, 'X'),
    (100, 1, 'Y'),

    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),

    (40, 0, 'X'),

    (41, 0, 'Y')


declare @newData table (
    groupIdentifier int,
    rowOrdinal int,
    value varchar(1))

insert into @newData values 
    (1, 55, 'X'),
    (1, 59, 'Y'),

    (2, 0, 'Y'),
    (2, 1, 'X')

-- @newData group 1 matches to @existingData group 100, @newData group 2 has no match in existingData

The desired result is a result set with two columns, existingGroupIdentifier and newGroupIdentifier. In this case the only result row would be 100, 1. The 100 being the @existingData groupIdentifier and the 1 being the @newData groupIdentifier

Edit The following is what I have come up with so far, by assuming I will ever have a max group size of N, I can manually copy paste tsql code that uses pivot and temp tables to do the comparison for each group size. BUT, this limits the system to N, seems ugly, and I would prefer a way to do it in a single query if possible

declare @existingData table (
    groupIdentifier int,
    rowOrdinal int,
    value varchar(1))

insert into @existingData values 
    (100, 0, 'X'),
    (100, 1, 'Y'),

    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),

    (40, 0, 'X'),

    (41, 0, 'Y')


declare @newData table (
    groupIdentifier int,
    rowOrdinal int,
    value varchar(1))

insert into @newData values 
    (1, 55, 'X'),
    (1, 59, 'Y'),

    (2, 0, 'Y'),
    (2, 1, 'X'),

    (3, 99, 'Y'),

    (5, 4, 'A'),
    (5, 10, 'B'),
    (5, 200, 'C')


-- First build table of the size of each group, limiting @existingData to only potentially matching groups (have at least one member in common)
declare @potentialGroupsInExistingData table (groupIdentifier int, groupSize int)

insert into @potentialGroupsInExistingData
    select
        ExistingData.groupIdentifier, COUNT(ExistingData.groupIdentifier)
    from
        @existingData ExistingData
    where
        exists (select top 1 * from @newData where value = ExistingData.value)
    group by ExistingData.groupIdentifier

declare @groupsInNewData table (groupIdentifier int, groupSize int)

insert into @groupsInNewData
    select
        NewData.groupIdentifier, COUNT(NewData.groupIdentifier)
    from
        @newData NewData
    group by NewData.groupIdentifier


-- handle groups of size one, this is a simpler case of the pivoting used with more than size 1 groups
-----------------------------------
select
    ExistingData.groupIdentifier as ExistingGroupIdentifier,
    NewData.groupIdentifier as NewGroupIdentifier
from
    @potentialGroupsInExistingData PotentialExistingGroup
    cross join @groupsInNewData GroupsInNewData
    inner join @existingData ExistingData on
        ExistingData.groupIdentifier = PotentialExistingGroup.groupIdentifier
    inner join @newData NewData on
        NewData.groupIdentifier = GroupsInNewData.groupIdentifier
        and NewData.value = ExistingData.value
where
    PotentialExistingGroup.groupSize = 1
    and GroupsInNewData.groupSize = 1


-- handle groups of size two
-----------------------------------
declare @existingGroupsOfSizeTwo table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(2))

insert into @existingGroupsOfSizeTwo 
    select
        *
    from
        (select
            ExistingData.groupIdentifier,
            ExistingData.value,
            ROW_NUMBER() over (partition by ExistingData.groupIdentifier order by ExistingData.rowOrdinal desc) as ActualOrdinal
        from
            @potentialGroupsInExistingData PotentialGroup
            inner join @existingData ExistingData on
                ExistingData.groupIdentifier = PotentialGroup.groupIdentifier
        where
            PotentialGroup.groupSize = 2) as T
    pivot ( min(value) for T.ActualOrdinal in ([1], [2]) ) as p

declare @newGroupsOfSizeTwo table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(2))

insert into @newGroupsOfSizeTwo
    select
        *
    from
        (select
            NewData.groupIdentifier,
            NewData.value,
            ROW_NUMBER() over (partition by NewData.groupIdentifier order by NewData.rowOrdinal desc) as ActualOrdinal
        from
            @groupsInNewData NewDataGroup
            inner join @newData NewData on
                NewData.groupIdentifier = NewDataGroup.groupIdentifier
        where
            NewDataGroup.groupSize = 2) as T
    pivot ( min(value) for T.ActualOrdinal in ([1], [2]) ) as p

select
    ExistingData.groupIdentifier as ExistingGroupIdentifier,
    NewData.groupIdentifier as NewGroupIdentifier
from
    @newGroupsOfSizeTwo NewData
    inner join @existingGroupsOfSizeTwo ExistingData on
        ExistingData.valueOne = NewData.valueOne
        and ExistingData.valueTwo = NewData.valueTwo


-- handle groups of size three
-----------------------------------
declare @existingGroupsOfSizeThree table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(1), valueThree varchar(1))

insert into @existingGroupsOfSizeThree 
    select
        *
    from
        (select
            ExistingData.groupIdentifier,
            ExistingData.value,
            ROW_NUMBER() over (partition by ExistingData.groupIdentifier order by ExistingData.rowOrdinal desc) as ActualOrdinal
        from
            @potentialGroupsInExistingData PotentialGroup
            inner join @existingData ExistingData on
                ExistingData.groupIdentifier = PotentialGroup.groupIdentifier
        where
            PotentialGroup.groupSize = 3) as T
    pivot ( min(value) for T.ActualOrdinal in ([1], [2], [3]) ) as p

declare @newGroupsOfSizeThree table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(1), valueThree varchar(1))

insert into @newGroupsOfSizeThree
    select
        *
    from
        (select
            NewData.groupIdentifier,
            NewData.value,
            ROW_NUMBER() over (partition by NewData.groupIdentifier order by NewData.rowOrdinal desc) as ActualOrdinal
        from
            @groupsInNewData NewDataGroup
            inner join @newData NewData on
                NewData.groupIdentifier = NewDataGroup.groupIdentifier
        where
            NewDataGroup.groupSize = 3) as T
    pivot ( min(value) for T.ActualOrdinal in ([1], [2], [3]) ) as p

select
    ExistingData.groupIdentifier as ExistingGroupIdentifier,
    NewData.groupIdentifier as NewGroupIdentifier
from
    @newGroupsOfSizeThree NewData
    inner join @existingGroupsOfSizeThree ExistingData on
        ExistingData.valueOne = NewData.valueOne
        and ExistingData.valueTwo = NewData.valueTwo
        and ExistingData.valueThree = NewData.valueThree
Vladimir Baranov
  • 31,799
  • 5
  • 53
  • 90
BrandonAGr
  • 5,827
  • 5
  • 47
  • 72

2 Answers2

2

General idea

The given tables can have several rows for the same group ID. If we had a method to converge the given tables in such a way that they had one row for each group ID plus all the values of the group in one column, then it would become trivial to find all matching groups.

If we did this transformation

@existingData -> @ExistingDataGrouped (ID, DataValues)

@newData -> @NewDataGrouped (ID, DataValues)

then the final query would look like this (note that we are joining on DataValues, not ID):

SELECT
    E.ID, N.ID
FROM
    @ExistingDataGrouped AS E
    INNER JOIN @NewDataGrouped AS N ON N.DataValues = E.DataValues

How to make the grouped tables

Some optimization

If the number of source rows is significant, it is possible to do some preliminary filtering by using CHECKSUM_AGG.

WITH
CTE_ExistingRN
AS
(
    SELECT
        GroupIdentifier
        ,ROW_NUMBER() OVER(PARTITION BY GroupIdentifier ORDER BY RowOrdinal) AS rn
        ,Value
    FROM @ExistingData
)
,CTE_NewRN
AS
(
    SELECT
        GroupIdentifier
        ,ROW_NUMBER() OVER(PARTITION BY GroupIdentifier ORDER BY RowOrdinal) AS rn
        ,Value
    FROM @NewData
)
,CTE_ExistingAgg
AS
(
    SELECT
        GroupIdentifier
        , CHECKSUM_AGG(CHECKSUM(rn, Value)) AS DataValues
    FROM CTE_ExistingRN
    GROUP BY GroupIdentifier
)
,CTE_NewAgg
AS
(
    SELECT
        GroupIdentifier
        , CHECKSUM_AGG(CHECKSUM(rn, Value)) AS DataValues
    FROM CTE_NewRN
    GROUP BY GroupIdentifier
)
SELECT
    CTE_ExistingAgg.GroupIdentifier AS ExistingGroupIdentifier
    , CTE_NewAgg.GroupIdentifier AS NewGroupIdentifier
FROM
    CTE_ExistingAgg
    INNER JOIN CTE_NewAgg ON CTE_NewAgg.DataValues = CTE_ExistingAgg.DataValues
;

At first we re-number all rows so that each group starts from 1 (CTE_ExistingRN and CTE_NewRN).

CHECKSUM(rn, Value) returns some integer for each source row taking into account the row number and its value. Different values would usually produce different checksums.

CHECKSUM_AGG groups all checksums together.

Result set:

ExistingGroupIdentifier    NewGroupIdentifier
100                        1
100                        2

This result would contain all groups that match exactly (100, 1), and it also can contain some groups that do not match, but by chance their checksums happened to be the same (100, 2). That's why this step is preliminary. To get accurate results you should compare actual values, not their checksums. But this step may filter out a significant number of groups that definitely don't match.

Solution using XML

This solution converts values of each group into XML and would provide accurate results. I personally never used FOR XML before and was curious to see how it works.

WITH
CTE_ExistingGroups
AS
(
    SELECT DISTINCT GroupIdentifier
    FROM @ExistingData
)
,CTE_NewGroups
AS
(
    SELECT DISTINCT GroupIdentifier
    FROM @NewData
)
,CTE_ExistingAgg
AS
(
    SELECT
        GroupIdentifier
        ,CA_Data.XML_Value AS DataValues
    FROM
        CTE_ExistingGroups
        CROSS APPLY
        (
            SELECT Value+','
            FROM @ExistingData
            WHERE GroupIdentifier = CTE_ExistingGroups.GroupIdentifier
            ORDER BY RowOrdinal FOR XML PATH(''), TYPE
        ) AS CA_XML(XML_Value)
        CROSS APPLY
        (
            SELECT CA_XML.XML_Value.value('.', 'NVARCHAR(MAX)')
        ) AS CA_Data(XML_Value)
)
,CTE_NewAgg
AS
(
    SELECT
        GroupIdentifier
        ,CA_Data.XML_Value AS DataValues
    FROM
        CTE_NewGroups
        CROSS APPLY
        (
            SELECT Value+','
            FROM @NewData
            WHERE GroupIdentifier = CTE_NewGroups.GroupIdentifier
            ORDER BY RowOrdinal FOR XML PATH(''), TYPE
        ) AS CA_XML(XML_Value)
        CROSS APPLY
        (
            SELECT CA_XML.XML_Value.value('.', 'NVARCHAR(MAX)')
        ) AS CA_Data(XML_Value)
)
SELECT
    CTE_ExistingAgg.GroupIdentifier AS ExistingGroupIdentifier
    , CTE_NewAgg.GroupIdentifier AS NewGroupIdentifier
FROM
    CTE_ExistingAgg
    INNER JOIN CTE_NewAgg ON CTE_NewAgg.DataValues = CTE_ExistingAgg.DataValues
;

Result set:

ExistingGroupIdentifier    NewGroupIdentifier
100                        1
Community
  • 1
  • 1
Vladimir Baranov
  • 31,799
  • 5
  • 53
  • 90
  • Thanks for the detailed explanation! Converting the group of data values into a format that can be directly compared with one equality check sounds like the right approach – BrandonAGr Feb 13 '15 at 06:58
0

Try this:

declare @existingData table (
    groupIdentifier int,
    rowOrdinal int,
    value varchar(1))

insert into @existingData values 
    (100, 0, 'X'),
    (100, 1, 'Y'),

    (200, 0, 'A'),
    (200, 1, 'B'),
    (200, 2, 'C'),

    (40, 0, 'X'),

    (41, 0, 'Y')


declare @newData table (
    groupIdentifier int,
    rowOrdinal int,
    value varchar(1))

insert into @newData values 
    (1, 55, 'X'),
    (1, 59, 'Y'),

    (2, 0, 'Y'),
    (2, 1, 'X')

declare @results table (
    existingGID int,
    newGID int)

DECLARE @existingGroupID int
DECLARE outer_cursor CURSOR FOR
SELECT DISTINCT groupIdentifier FROM @existingData
OPEN outer_cursor
FETCH NEXT FROM outer_cursor INTO @existingGroupID
WHILE @@FETCH_STATUS = 0
BEGIN
    DECLARE @existingGroupCount int
    SELECT @existingGroupCount = COUNT(value) FROM @existingData WHERE groupIdentifier = @existingGroupID
    DECLARE @newGroupID int
    DECLARE inner_cursor CURSOR FOR
    SELECT DISTINCT groupIdentifier from @newData
    OPEN inner_cursor
    FETCH NEXT FROM inner_cursor INTO @newGroupID
    WHILE @@FETCH_STATUS = 0
    BEGIN
        DECLARE @newGroupCount int
        SELECT @newGroupCount = COUNT(value) FROM @newData WHERE groupIdentifier = @newGroupID
        -- if groups are different sizes, skip
        IF @newGroupCount = @existingGroupCount
        BEGIN
            DECLARE @newStart int = -1
            DECLARE @currentValue varchar(1)
            DECLARE @validGroup bit = 1
            DECLARE equality_cursor CURSOR FOR
            SELECT value FROM @existingData WHERE groupIdentifier = @existingGroupID ORDER BY rowOrdinal
            OPEN equality_cursor
            FETCH NEXT FROM equality_cursor INTO @currentValue
            WHILE @@FETCH_STATUS = 0
            BEGIN
                DECLARE @newValue varchar(1)
                SELECT TOP 1 @newValue = value, @newStart = rowOrdinal FROM @newData WHERE groupIdentifier = @newGroupID AND @newStart < rowOrdinal ORDER BY rowOrdinal
                IF(@newValue <> @currentValue)
                BEGIN
                    SET @validGroup = 0
                    BREAK
                END
                FETCH NEXT FROM equality_cursor INTO @currentValue
            END
            CLOSE equality_cursor
            DEALLOCATE equality_cursor
            IF @validGroup = 1
            BEGIN
                INSERT INTO @results (existingGID, newGID) VALUES (@existingGroupID, @newGroupID)
            END
        END
        FETCH NEXT FROM inner_cursor INTO @newGroupID
    END
    CLOSE inner_cursor
    DEALLOCATE inner_cursor
    FETCH NEXT FROM outer_cursor INTO @existingGroupID
END
CLOSE outer_cursor
DEALLOCATE outer_cursor

SELECT * FROM @results

I need to get going, but I'll edit this later with better comments to explain what the code does.

ristonj
  • 1,590
  • 1
  • 12
  • 15