I have a table that is storing groups of related rows, the different rows are related via a groupIdentifier column. Groups can be any number of rows in size.
I need to be able to pass in a new set of groups of rows and then find a mapping of new to existing matching groups. The complication is that the order of each row within the group is defined by a rowOrdinal value and must be taken into account. That rowOrdinal value is not always 0 based but the rows within a group are sorted by that value. Also @existingData contains 100s of thousands of potential groups, so the query needs to be performant
Here is an example input dataset:
declare @existingData table (
groupIdentifier int,
rowOrdinal int,
value varchar(1))
insert into @existingData values
(100, 0, 'X'),
(100, 1, 'Y'),
(200, 0, 'A'),
(200, 1, 'B'),
(200, 2, 'C'),
(40, 0, 'X'),
(41, 0, 'Y')
declare @newData table (
groupIdentifier int,
rowOrdinal int,
value varchar(1))
insert into @newData values
(1, 55, 'X'),
(1, 59, 'Y'),
(2, 0, 'Y'),
(2, 1, 'X')
-- @newData group 1 matches to @existingData group 100, @newData group 2 has no match in existingData
The desired result is a result set with two columns, existingGroupIdentifier and newGroupIdentifier. In this case the only result row would be 100, 1. The 100 being the @existingData groupIdentifier and the 1 being the @newData groupIdentifier
Edit The following is what I have come up with so far, by assuming I will ever have a max group size of N, I can manually copy paste tsql code that uses pivot and temp tables to do the comparison for each group size. BUT, this limits the system to N, seems ugly, and I would prefer a way to do it in a single query if possible
declare @existingData table (
groupIdentifier int,
rowOrdinal int,
value varchar(1))
insert into @existingData values
(100, 0, 'X'),
(100, 1, 'Y'),
(200, 0, 'A'),
(200, 1, 'B'),
(200, 2, 'C'),
(40, 0, 'X'),
(41, 0, 'Y')
declare @newData table (
groupIdentifier int,
rowOrdinal int,
value varchar(1))
insert into @newData values
(1, 55, 'X'),
(1, 59, 'Y'),
(2, 0, 'Y'),
(2, 1, 'X'),
(3, 99, 'Y'),
(5, 4, 'A'),
(5, 10, 'B'),
(5, 200, 'C')
-- First build table of the size of each group, limiting @existingData to only potentially matching groups (have at least one member in common)
declare @potentialGroupsInExistingData table (groupIdentifier int, groupSize int)
insert into @potentialGroupsInExistingData
select
ExistingData.groupIdentifier, COUNT(ExistingData.groupIdentifier)
from
@existingData ExistingData
where
exists (select top 1 * from @newData where value = ExistingData.value)
group by ExistingData.groupIdentifier
declare @groupsInNewData table (groupIdentifier int, groupSize int)
insert into @groupsInNewData
select
NewData.groupIdentifier, COUNT(NewData.groupIdentifier)
from
@newData NewData
group by NewData.groupIdentifier
-- handle groups of size one, this is a simpler case of the pivoting used with more than size 1 groups
-----------------------------------
select
ExistingData.groupIdentifier as ExistingGroupIdentifier,
NewData.groupIdentifier as NewGroupIdentifier
from
@potentialGroupsInExistingData PotentialExistingGroup
cross join @groupsInNewData GroupsInNewData
inner join @existingData ExistingData on
ExistingData.groupIdentifier = PotentialExistingGroup.groupIdentifier
inner join @newData NewData on
NewData.groupIdentifier = GroupsInNewData.groupIdentifier
and NewData.value = ExistingData.value
where
PotentialExistingGroup.groupSize = 1
and GroupsInNewData.groupSize = 1
-- handle groups of size two
-----------------------------------
declare @existingGroupsOfSizeTwo table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(2))
insert into @existingGroupsOfSizeTwo
select
*
from
(select
ExistingData.groupIdentifier,
ExistingData.value,
ROW_NUMBER() over (partition by ExistingData.groupIdentifier order by ExistingData.rowOrdinal desc) as ActualOrdinal
from
@potentialGroupsInExistingData PotentialGroup
inner join @existingData ExistingData on
ExistingData.groupIdentifier = PotentialGroup.groupIdentifier
where
PotentialGroup.groupSize = 2) as T
pivot ( min(value) for T.ActualOrdinal in ([1], [2]) ) as p
declare @newGroupsOfSizeTwo table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(2))
insert into @newGroupsOfSizeTwo
select
*
from
(select
NewData.groupIdentifier,
NewData.value,
ROW_NUMBER() over (partition by NewData.groupIdentifier order by NewData.rowOrdinal desc) as ActualOrdinal
from
@groupsInNewData NewDataGroup
inner join @newData NewData on
NewData.groupIdentifier = NewDataGroup.groupIdentifier
where
NewDataGroup.groupSize = 2) as T
pivot ( min(value) for T.ActualOrdinal in ([1], [2]) ) as p
select
ExistingData.groupIdentifier as ExistingGroupIdentifier,
NewData.groupIdentifier as NewGroupIdentifier
from
@newGroupsOfSizeTwo NewData
inner join @existingGroupsOfSizeTwo ExistingData on
ExistingData.valueOne = NewData.valueOne
and ExistingData.valueTwo = NewData.valueTwo
-- handle groups of size three
-----------------------------------
declare @existingGroupsOfSizeThree table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(1), valueThree varchar(1))
insert into @existingGroupsOfSizeThree
select
*
from
(select
ExistingData.groupIdentifier,
ExistingData.value,
ROW_NUMBER() over (partition by ExistingData.groupIdentifier order by ExistingData.rowOrdinal desc) as ActualOrdinal
from
@potentialGroupsInExistingData PotentialGroup
inner join @existingData ExistingData on
ExistingData.groupIdentifier = PotentialGroup.groupIdentifier
where
PotentialGroup.groupSize = 3) as T
pivot ( min(value) for T.ActualOrdinal in ([1], [2], [3]) ) as p
declare @newGroupsOfSizeThree table (groupIdentifier int, valueOne varchar(1), valueTwo varchar(1), valueThree varchar(1))
insert into @newGroupsOfSizeThree
select
*
from
(select
NewData.groupIdentifier,
NewData.value,
ROW_NUMBER() over (partition by NewData.groupIdentifier order by NewData.rowOrdinal desc) as ActualOrdinal
from
@groupsInNewData NewDataGroup
inner join @newData NewData on
NewData.groupIdentifier = NewDataGroup.groupIdentifier
where
NewDataGroup.groupSize = 3) as T
pivot ( min(value) for T.ActualOrdinal in ([1], [2], [3]) ) as p
select
ExistingData.groupIdentifier as ExistingGroupIdentifier,
NewData.groupIdentifier as NewGroupIdentifier
from
@newGroupsOfSizeThree NewData
inner join @existingGroupsOfSizeThree ExistingData on
ExistingData.valueOne = NewData.valueOne
and ExistingData.valueTwo = NewData.valueTwo
and ExistingData.valueThree = NewData.valueThree