2

I'm trying to join two DataTables on a dynamic number of columns. I've gotten as far as the code below. The problem is the ON statement of the join. How can I make this dynamic based on how many column names are in the list "joinColumnNames".

I was thinking I will need to build some sort of expression tree, but I can't find any examples of how to do this with multiple join columns and with the DataRow object which doesn't have properties for each column.

private DataTable Join(List<string> joinColumnNames, DataTable pullX, DataTable pullY)
{
    DataTable joinedTable = new DataTable();

    // Add all the columns from pullX
    foreach (string colName in joinColumnNames)
    {
        joinedTable.Columns.Add(pullX.Columns[colName]);
    }
    // Add unique columns from PullY
    foreach (DataColumn col in pullY.Columns)
    {
        if (!joinedTable.Columns.Contains((col.ColumnName)))
        {
            joinedTable.Columns.Add(col);
        }
    }

    var Join = (from PX in pullX.AsEnumerable()
                join PY in pullY.AsEnumerable() on 
                // This must be dynamic and join on every column mentioned in joinColumnNames
                new { A = PX[joinColumnNames[0]], B = PX[joinColumnNames[1]] } equals new { A = PY[joinColumnNames[0]], B = PY[joinColumnNames[1]] } 
                into Outer
                from PY in Outer.DefaultIfEmpty<DataRow>(pullY.NewRow())
                select new { PX, PY });

    foreach (var item in Join)
    {
        DataRow newRow = joinedTable.NewRow();
        foreach (DataColumn col in joinedTable.Columns)
        {
            var pullXValue = item.PX.Table.Columns.Contains(col.ColumnName) ? item.PX[col.ColumnName] : string.Empty;
            var pullYValue = item.PY.Table.Columns.Contains(col.ColumnName) ? item.PY[col.ColumnName] : string.Empty;
            newRow[col.ColumnName] = (pullXValue == null || string.IsNullOrEmpty(pullXValue.ToString())) ? pullYValue : pullXValue;
        }
        joinedTable.Rows.Add(newRow);
    }

    return joinedTable;
}

Adding a specific example to show input/output using 3 join columns (Country, Company, and DateId):

Pull X:

Country        Company       DateId    Sales
United States  Test1 Ltd    20160722    $25 

Canada         Test3 Ltd    20160723    $30 

Italy          Test4 Ltd    20160724    $40 

India          Test2 Ltd    20160725    $35 

Pull Y:

Country        Company       DateId    Downloads
United States  Test1 Ltd    20160722    500 

Mexico         Test2 Ltd    20160723    300 

Italy          Test4 Ltd    20160724    900 

Result:

Country        Company       DateId    Sales    Downloads
United States  Test1 Ltd    20160722    $25      500 

Canada         Test3 Ltd    20160723    $30  

Mexico         Test2 Ltd    20160723                300 

Italy          Test4 Ltd    20160724    $40      900 

India          Test2 Ltd    20160725    $35      
Uwe Keim
  • 39,551
  • 56
  • 175
  • 291
typheon
  • 138
  • 1
  • 1
  • 9
  • 1
    Can you give a sample input and output? What about the columns from "pullX" that are not in "joinColumnNames"? Don't you want to have these columns in the output? – Yacoub Massad Aug 05 '16 at 12:37
  • yes, the output should have all columns from x and y. I added a specific example. – typheon Aug 05 '16 at 18:34

2 Answers2

1
var Join = 
    from PX in pullX.AsEnumerable()
    join PY in pullY.AsEnumerable()
    on     string.Join("\0", joinColumnNames.Select(c => PX[c]))
    equals string.Join("\0", joinColumnNames.Select(c => PY[c]))
    into Outer
    from PY in Outer.DefaultIfEmpty<DataRow>(pullY.NewRow())
    select new { PX, PY };

Another way is to have both DataTable in a DataSet and use DataRelation
How To: Use DataRelation to perform a join on two DataTables in a DataSet?

Community
  • 1
  • 1
Slai
  • 22,144
  • 5
  • 45
  • 53
1

Since you are using LINQ to Objects, there is no need to use expression trees. You can solve your problem with a custom equality comparer.

Create an equality comparer that can compare equality between two DataRow objects based on the values of specific columns. Here is an example:

public class MyEqualityComparer : IEqualityComparer<DataRow>
{
    private readonly string[] columnNames;

    public MyEqualityComparer(string[] columnNames)
    {
        this.columnNames = columnNames;
    }

    public bool Equals(DataRow x, DataRow y)
    {
        return columnNames.All(cn => x[cn].Equals(y[cn]));
    }

    public int GetHashCode(DataRow obj)
    {
        unchecked
        {
            int hash = 19;
            foreach (var value in columnNames.Select(cn => obj[cn]))
            {
                hash = hash * 31 + value.GetHashCode();
            }
            return hash;
        }
    }
}

Then you can use it to make the join like this:

public class TwoRows
{
    public DataRow Row1 { get; set; }
    public DataRow Row2 { get; set; }
}

private static List<TwoRows> LeftOuterJoin(
    List<string> joinColumnNames,
    DataTable leftTable,
    DataTable rightTable)
{
    return leftTable
        .AsEnumerable()
        .GroupJoin(
            rightTable.AsEnumerable(),
            l => l,
            r => r,
            (l, rlist) => new {LeftValue = l, RightValues = rlist},
            new MyEqualityComparer(joinColumnNames.ToArray()))
        .SelectMany(
            x => x.RightValues.DefaultIfEmpty(rightTable.NewRow()),
            (x, y) => new TwoRows {Row1 = x.LeftValue, Row2 = y})
        .ToList();
}

Please note that I am using method syntax because I don't think that you can use a custom equality comparer otherwise.

Please note that the method does a left outer join, not a full outer join. Based on the example you provided, you seem to want a full outer join. To do this you need to do two left outer joins (see this answer). Here is how the full method would look like:

private static DataTable FullOuterJoin(
    List<string> joinColumnNames,
    DataTable pullX,
    DataTable pullY)
{
    var pullYOtherColumns =
        pullY.Columns
            .Cast<DataColumn>()
            .Where(x => !joinColumnNames.Contains(x.ColumnName))
            .ToList();

    var allColumns = 
        pullX.Columns
            .Cast<DataColumn>()
            .Concat(pullYOtherColumns)
            .ToArray();

    var allColumnsClone =
        allColumns
            .Select(x => new DataColumn(x.ColumnName, x.DataType))
            .ToArray();

    DataTable joinedTable = new DataTable();

    joinedTable.Columns.AddRange(allColumnsClone);

    var first =
        LeftOuterJoin(joinColumnNames, pullX, pullY);

    var resultRows = new List<DataRow>();

    foreach (var item in first)
    {
        DataRow newRow = joinedTable.NewRow();
        foreach (DataColumn col in joinedTable.Columns)
        {
            var value = pullX.Columns.Contains(col.ColumnName)
                ? item.Row1[col.ColumnName]
                : item.Row2[col.ColumnName];

            newRow[col.ColumnName] = value;
        }
        resultRows.Add(newRow);
    }

    var second =
        LeftOuterJoin(joinColumnNames, pullY, pullX);

    foreach (var item in second)
    {
        DataRow newRow = joinedTable.NewRow();
        foreach (DataColumn col in joinedTable.Columns)
        {
            var value = pullY.Columns.Contains(col.ColumnName)
                ? item.Row1[col.ColumnName]
                : item.Row2[col.ColumnName];

            newRow[col.ColumnName] = value;
        }
        resultRows.Add(newRow);
    }

    var uniqueRows =
        resultRows
            .Distinct(
                new MyEqualityComparer(
                    joinedTable.Columns
                        .Cast<DataColumn>()
                        .Select(x => x.ColumnName)
                        .ToArray()));

    foreach (var uniqueRow in uniqueRows)
        joinedTable.Rows.Add(uniqueRow);


    return joinedTable;
}

Please note also how I clone the columns. You cannot use the same column object in two tables.

Community
  • 1
  • 1
Yacoub Massad
  • 27,509
  • 2
  • 36
  • 62
  • Very thorough and complete solution, I like it. Thank you for pointing out the column clone issue and my left join instead of full outer. I ended up doing a very similar thing. I left join x and y, left join y and x, and then I used union. – typheon Aug 06 '16 at 00:17