10

This code removes duplicates from the original list, but I want to extract the duplicates from the original list, not remove them (the package name is just part of another project):

Given:

a Person POJO:

package at.mavila.learn.kafka.kafkaexercises;

import org.apache.commons.lang3.builder.ToStringBuilder;

public class Person {

    private final Long id;
    private final String firstName;
    private final String secondName;

    private Person(final Builder builder) {
        this.id = builder.id;
        this.firstName = builder.firstName;
        this.secondName = builder.secondName;
    }

    public Long getId() {
        return id;
    }

    public String getFirstName() {
        return firstName;
    }

    public String getSecondName() {
        return secondName;
    }

    public static class Builder {

        private Long id;
        private String firstName;
        private String secondName;

        public Builder id(final Long id) {
            this.id = id;
            return this;
        }

        public Builder firstName(final String first) {
            this.firstName = first;
            return this;
        }

        public Builder secondName(final String second) {
            this.secondName = second;
            return this;
        }

        public Person build() {
            return new Person(this);
        }
    }

    @Override
    public String toString() {
        return new ToStringBuilder(this)
                .append("id", id)
                .append("firstName", firstName)
                .append("secondName", secondName)
                .toString();
    }
}

The duplicate extraction code.

Notice that here we filter by the id and the first name to retrieve a new list. I saw this code somewhere else; it is not mine:

package at.mavila.learn.kafka.kafkaexercises;

import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import static java.util.Objects.isNull;

public final class DuplicatePersonFilter {


private DuplicatePersonFilter() {
    //No instances of this class
}

public static List<Person> getDuplicates(final List<Person> personList) {
    return personList
            .stream()
            .filter(duplicateByKey(Person::getId))
            .filter(duplicateByKey(Person::getFirstName))
            .collect(Collectors.toList());
}

private static <T> Predicate<T> duplicateByKey(final Function<? super T, Object> keyExtractor) {
    Map<Object,Boolean> seen = new ConcurrentHashMap<>();
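    // putIfAbsent returns null only the first time a key is seen, so this
    // predicate keeps the FIRST occurrence per key (the distinct elements),
    // not the duplicates; hence the test below prints the first four persons.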
    return t -> isNull(seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE));

}

}

The test code. If you run this test case you will get [alex, lolita, elpidio, romualdo].

I would instead expect to get [romualdo, otroRomualdo] as the extracted duplicates, given the id and the firstName:

package at.mavila.learn.kafka.kafkaexercises;


import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;

import static org.junit.Assert.*;

public class DuplicatePersonFilterTest {

private static final Logger LOGGER = LoggerFactory.getLogger(DuplicatePersonFilterTest.class);



@Test
public void testList(){

    Person alex = new Person.Builder().id(1L).firstName("alex").secondName("salgado").build();
    Person lolita = new Person.Builder().id(2L).firstName("lolita").secondName("llanero").build();
    Person elpidio = new Person.Builder().id(3L).firstName("elpidio").secondName("ramirez").build();
    Person romualdo = new Person.Builder().id(4L).firstName("romualdo").secondName("gomez").build();
    Person otroRomualdo = new Person.Builder().id(4L).firstName("romualdo").secondName("perez").build();


    List<Person> personList = new ArrayList<>();

    personList.add(alex);
    personList.add(lolita);
    personList.add(elpidio);
    personList.add(romualdo);
    personList.add(otroRomualdo);

    final List<Person> duplicates = DuplicatePersonFilter.getDuplicates(personList);

    LOGGER.info("Duplicates: {}",duplicates);

}

}

At my job I was able to get the desired result by using a Comparator with a TreeMap and an ArrayList, but this meant creating a list, filtering it, and then passing the filter again to a newly created list; the code looked bloated (and was probably inefficient).
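For reference, it was doing something like this (a rough sketch from memory; as I mention in the comments below, it builds a distinct-by-id list rather than extracting the duplicates):

personList.stream()
        .collect(Collectors.collectingAndThen(
                Collectors.toCollection(() -> new TreeSet<>(Comparator.comparingLong(Person::getId))),
                ArrayList::new));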

Does someone have a better idea of how to extract duplicates, not remove them?

Thanks in advance.

Update

Thanks everyone for your answers

To remove the duplicates using the same approach with uniqueAttributes:

public static List<Person> removeDuplicates(List<Person> personList) {
    return getDuplicatesMap(personList).values().stream()
            .map(duplicates -> duplicates.get(0)) // keep one representative per unique key
            .collect(Collectors.toList());
}

private static Map<String, List<Person>> getDuplicatesMap(List<Person> personList) {
    return personList.stream().collect(groupingBy(DuplicatePersonFilter::uniqueAttributes));
}

private static String uniqueAttributes(Person person) {
    if (Objects.isNull(person)) {
        return StringUtils.EMPTY;
    }
    return person.getId() + person.getFirstName();
}
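For example, with the test data from the question:

List<Person> unique = removeDuplicates(personList);
// one Person per unique id + firstName, e.g. [alex, lolita, elpidio, romualdo]
// (order depends on the backing HashMap and is not guaranteed)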

Update 2

But also the answer provided by @brett-ryan is correct:

public static List<Person> extractDuplicatesWithIdentityCountingV2(final List<Person> personList) {

    List<Person> duplicates = personList.stream()
            .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
            .entrySet().stream()
            .filter(n -> n.getValue() > 1)
            .flatMap(n -> nCopies(n.getValue().intValue(), n.getKey()).stream())
            .collect(toList());

    return duplicates;
}
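Note that this version relies on Person implementing equals() and hashCode() (as Brett Ryan's answer below explains) and on the following static imports:

import static java.util.Collections.nCopies;
import static java.util.stream.Collectors.toList;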

EDIT

The code above can be found at:

https://gitlab.com/totopoloco/marco_utilities/-/tree/master/duplicates_exercises

Please see:

Usage: https://gitlab.com/totopoloco/marco_utilities/-/blob/master/duplicates_exercises/src/test/java/at/mavila/exercises/duplicates/lists/DuplicatePersonFilterTest.java

Implementation: https://gitlab.com/totopoloco/marco_utilities/-/blob/master/duplicates_exercises/src/main/java/at/mavila/exercises/duplicates/lists/DuplicatePersonFilter.java

  • I think you've posted too much code here. Typically keep it to 10-20 lines if possible. It might be hard for someone to parse what you're doing here. – Tim Biegeleisen Nov 06 '18 at 02:27
  • *get instead [romualdo, otroRomualdo]*... the other is not even in the list. – Naman Nov 06 '18 at 02:35
  • @nullpointer exactly, no idea why the upvotes for a pretty unclear question – Eugene Nov 06 '18 at 02:36
  • Sorry if I was not clear: we have 5 objects in the list, and passing it through the method returns the first 4 elements, but we would like to get elements 3 and 4 (counting from 0), because these two are duplicated in the list. As I was saying, I somehow got the result by extracting a list from a list with Comparator and TreeMap, but the code looked really bloated. I could share it, but I have it on the other laptop; I will share it as soon as I sit in the office. – Marco Tulio Avila Cerón Nov 06 '18 at 02:39
  • *In my job I was able to get the desired result it by using Comparator using TreeMap and ArrayList, but this was creating a list then filtering it*.... where is that code? I would've rather put that here with the explanation instead of what you've shared at present. – Naman Nov 06 '18 at 02:40
  • so you need two lists: one with distinct values (which you have right now) and another with duplicate values, am I right? – Ryuzaki L Nov 06 '18 at 02:42
  • @Deadpool, yes, that is my desire. I don't care so much about the duplicates really, but as you say, for the community we already have one method. – Marco Tulio Avila Cerón Nov 06 '18 at 02:44
  • but you are identifying duplicates based on `id` and `firstname`? – Ryuzaki L Nov 06 '18 at 02:52
  • For the moment, yes; if we extended it to the last name we would get a list with 0 elements in that example (another filter). – Marco Tulio Avila Cerón Nov 06 '18 at 02:53
  • About my last comment: I mean when finding duplicates; it would return the same elements when removing duplicates. – Marco Tulio Avila Cerón Nov 06 '18 at 03:05
  • it is tricky to get two different lists from one method; prefer using a separate method for each case @MarcoTulioAvilaCerón – Ryuzaki L Nov 06 '18 at 03:20
  • @Deadpool yes, there should be two methods in the final class (probably also finding a better name). Your code below looks promising; I will give it a try and give you an update – Marco Tulio Avila Cerón Nov 06 '18 at 03:23
  • Hello friends, I found my code; it was doing something like this: personList.stream().collect(Collectors.collectingAndThen(Collectors.toCollection(() -> new TreeSet<>(Comparator.comparingLong(Person::getId))), ArrayList::new)); // But this does not work – Marco Tulio Avila Cerón Nov 06 '18 at 12:18
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/183186/discussion-between-marco-tulio-avila-ceron-and-deadpool). – Marco Tulio Avila Cerón Nov 06 '18 at 12:22

7 Answers

11

To identify duplicates, no method I know of is better suited than Collectors.groupingBy(). It allows you to group the list into a map based on a condition of your choice.

Your condition is a combination of id and firstName. Let's extract this part into its own method in Person:

String uniqueAttributes() {
  return id + firstName;
}
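Note that plain concatenation can produce colliding keys: id 1 with firstName "1x" and id 11 with firstName "x" both yield "11x". If that is a concern, a separator character avoids it:

String uniqueAttributes() {
  return id + "|" + firstName;
}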

The getDuplicates() method is now quite straightforward:

public static List<Person> getDuplicates(final List<Person> personList) {
  return getDuplicatesMap(personList).values().stream()
      .filter(duplicates -> duplicates.size() > 1)
      .flatMap(Collection::stream)
      .collect(Collectors.toList());
}

private static Map<String, List<Person>> getDuplicatesMap(List<Person> personList) {
  return personList.stream().collect(groupingBy(Person::uniqueAttributes));
}
  • The first line calls another method getDuplicatesMap() to create the map as explained above.
  • It then streams over the values of the map, which are lists of persons.
  • It filters out everything except lists with a size greater than 1, i.e. it finds the duplicates.
  • Finally, flatMap() flattens the stream of lists into one single stream of persons, which is then collected to a list; the short usage sketch below shows the result.
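With the test data from the question:

List<Person> duplicates = getDuplicates(personList);
// should yield [romualdo, otroRomualdo]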

An alternative, if you truly consider persons equal when they have the same id and firstName, is to go with the solution by Jonathan Johx and implement an equals() method.

Magnilex
5

If you could implement equals and hashCode on Person, you could then use a counting downstream collector with groupingBy to get the distinct elements that have been duplicated.

List<Person> duplicates = personList.stream()
  .collect(groupingBy(identity(), counting()))
  .entrySet().stream()
  .filter(n -> n.getValue() > 1)
  .map(n -> n.getKey())
  .collect(toList());

If you would like a list of the sequential repeated elements, you can use Collections.nCopies to expand them back out. This method ensures repeated elements are ordered together.

List<Person> duplicates = personList.stream()
    .collect(groupingBy(identity(), counting()))
    .entrySet().stream()
    .filter(n -> n.getValue() > 1)
    .flatMap(n -> nCopies(n.getValue().intValue(), n.getKey()).stream())
    .collect(toList());
Brett Ryan
  • This code snippet is "correct" in the sense that it extracts a single element per set of duplicates; e.g., if your list has 5 duplicates of "a" and 3 duplicates of "b", your snippet returns a list containing only the elements "a" and "b". – Marco Tulio Avila Cerón Jun 12 '20 at 12:28
  • But the requirement is quite clear: using the same example, we need a new list of 5 "a" and 3 "b" to be returned, giving a list of 8 elements; as I mentioned before, your code snippet returns a list with only 2 elements, so it is not good. The answer given by @Magnilex is still the correct one. – Marco Tulio Avila Cerón Jun 12 '20 at 12:30
  • My apologies @MarcoTulioAvilaCerón, but it didn't seem that clean in the original post you provided. – Brett Ryan Jun 12 '20 at 12:36
  • No problem, I am putting all answers in this post in Gitlab in: https://gitlab.com/totopoloco/marco_utilities/-/tree/master/duplicates_exercises – Marco Tulio Avila Cerón Jun 12 '20 at 12:41
  • I've updated with a solution that uses Collections.nCopies. You could also achieve this with `IntStream.range(n, obj)` but it becomes less readable. – Brett Ryan Jun 12 '20 at 12:56
  • Thanks, this is a good solution that does not depend on an explicit unique identifier. – Marco Tulio Avila Cerón Jun 12 '20 at 13:49
4
List<Person> duplicates = personList.stream()
  .collect(Collectors.groupingBy(Person::getId))
  .entrySet().stream()
  .filter(e->e.getValue().size() > 1)
  .flatMap(e->e.getValue().stream())
  .collect(Collectors.toList());

That should give you a List of Person where the id has been duplicated.
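With the question's test data, only id 4 occurs more than once, so this should likewise yield [romualdo, otroRomualdo].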

YoYo
3

In this scenario you need to write custom logic to extract the duplicates from the list; this way you will get all the duplicates in the Person list:

public static List<Person> extractDuplicates(final List<Person> personList) {

    return personList.stream().flatMap(i -> {
        final AtomicInteger count = new AtomicInteger();
        final List<Person> duplicatedPersons = new ArrayList<>();

        personList.forEach(p -> {

            if (p.getId().equals(i.getId()) && p.getFirstName().equals(i.getFirstName())) {
                count.getAndIncrement();
            }

            // add i exactly once, as soon as a second match is found
            if (count.get() == 2) {
                duplicatedPersons.add(i);
            }

        });

        return duplicatedPersons.stream();
    }).collect(Collectors.toList());
}

Applied to:

List<Person> l = new ArrayList<>();
Person alex = new Person.Builder().id(1L).firstName("alex").secondName("salgado").build();
Person lolita = new Person.Builder().id(2L).firstName("lolita").secondName("llanero").build();
Person elpidio = new Person.Builder().id(3L).firstName("elpidio").secondName("ramirez").build();
Person romualdo = new Person.Builder().id(4L).firstName("romualdo").secondName("gomez").build();
Person otroRomualdo = new Person.Builder().id(4L).firstName("romualdo").secondName("perez").build();
l.add(alex);
l.add(lolita);
l.add(elpidio);
l.add(romualdo);
l.add(otroRomualdo);

Output:

[Person [id=4, firstName=romualdo, secondName=gomez], Person [id=4, firstName=romualdo, secondName=perez]]
Ryuzaki L
3

I think first you should override the equals method of the Person class, focusing on id and firstName. After that you can update it by adding a filter for them:

@Override
public int hashCode() {
    return Objects.hash(id, firstName);
}

@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    if (obj == null) {
        return false;
    }
    if (getClass() != obj.getClass()) {
        return false;
    }
    final Person other = (Person) obj;
    if (!Objects.equals(firstName, other.firstName)) {
        return false;
    }
    if (!Objects.equals(id, other.id)) {
        return false;
    }
    return true;
}

personList
       .stream()
       .filter(p -> Collections.frequency(personList, p) > 1) // keep persons that occur more than once
       .collect(Collectors.toList());
Jonathan JOhx
1

Solution based on generic key:

public static <T> List<T> findDuplicates(List<T> list, Function<T, ?> uniqueKey) {
    if (list == null) {
        return emptyList();
    }
    Function<T, ?> notNullUniqueKey = el -> uniqueKey.apply(el) == null ? "" : uniqueKey.apply(el);
    return list.stream()
            .collect(groupingBy(notNullUniqueKey))
            .values()
            .stream()
            .filter(matches -> matches.size() > 1)
            .map(matches -> matches.get(0))
            .collect(toList());
}


// Example of usage:
List<Person> duplicates = findDuplicates(list, el -> el.getFirstName());
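Note that this returns a single representative per duplicated key (matches.get(0)); to collect every duplicated element instead, replace the map step with .flatMap(matches -> matches.stream()).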
Leonid Dashko
0
List<Person> arr = new ArrayList<>();
arr.add(alex);
arr.add(lolita);
arr.add(elpidio);
arr.add(romualdo);
arr.add(otroRomualdo);

// first pass: collect every occurrence after the first of each (firstName, id) key
Set<String> set = new HashSet<>();
List<Person> result = arr.stream()
                         .filter(data -> !set.add(data.getFirstName() + ";" + data.getId()))
                         .collect(Collectors.toList());
arr.removeAll(result);

// second pass: pick up the first occurrence of each duplicated key
Set<String> set2 = new HashSet<>();
result.forEach(data -> set2.add(data.getFirstName() + ";" + data.getId()));
List<Person> resultTwo = arr.stream()
                            .filter(data -> !set2.add(data.getFirstName() + ";" + data.getId()))
                            .collect(Collectors.toList());
result.addAll(resultTwo);

The code above filters based on first name and id. The resulting list will contain all the duplicated Person objects.