0

I have a table with 4 string columns in database, each row represent blocked user or group.

referrer | ip | userAgent | email

By group I mean that one of the columns (any) can have wild card (asterisk) which mean "block all of them"

for example this row

www.google.com | 127.0.0.2 | * | yahoo.com

Means that each user request with "google.com" as referrer, "127.0.0.2" asIP & "yahoo.com" as email, need to be blocked without consider UserAgent because it has wild card

The following code works in O(n) complexity, which is good enough for a small size table but my table contains over million rows

class BlacklistEntry {

    private String referrer, ip, userAgent, email;
    private static List<BlacklistEntry> cache = new ArrayList<>();

    BlacklistEntry(String referrer, String ip, String userAgent, String email) {
        this.referrer = referrer;
        this.ip = ip;
        this.userAgent = userAgent;
        this.email = email;
    }

    private static boolean isBlacklisted(String ref, String ip, String ue, String email) {
        final String MATCH_ALL = "*";
        return cache.stream()
            .filter(e ->
                (MATCH_ALL.equals(e.getReferrer()) || e.getReferrer().equals(ref)) &&
                (MATCH_ALL.equals(e.getIp()) || e.getIp().equals(ip)) &&
                (MATCH_ALL.equals(e.getUserAgent()) || e.getUserAgent().equals(ue)) &&
                (MATCH_ALL.equals(e.getEmail()) || e.getEmail().equals(email)))
            .findFirst()
            .isPresent();
    }

    public String getReferrer() {
        return referrer;
    }

    public void setReferrer(String referrer) {
        this.referrer = referrer;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getUserAgent() {
        return userAgent;
    }

    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    public String getEmail() {
        return email;
    }

    public void setEmail(String email) {
        this.email = email;
    }

    public static void main(String[] args) {

        cache.add(new BlacklistEntry("google.com", "127.0.0.2", "Mozilla", "yahoo.com"));
        cache.add(new BlacklistEntry("r1.com", "127.0.0.3", "Mozilla", "*"));
        cache.add(new BlacklistEntry("r2.com", "127.0.0.4", "Mozilla", "yahoo.com"));

        System.out.println(isBlacklisted("r2.com", "127.0.0.4", "Mozilla", "yahoo.com"));
        System.out.println(isBlacklisted("r1.com", "127.0.0.3", "Mozilla", "sould be true"));
        System.out.println(isBlacklisted("*", "127.0.0.3", "*", "*"));
    }
}

Is there something better than O(n) ? should I consider using Lucene ?

whoopdedoo
  • 2,815
  • 23
  • 46
  • Your `equals()` and `hashCode()` methods are both broken. Your `equals` method should check for the wildcard character on "this" as well and your `hashCode()` method should react on the wildcards. Keep in mind that objects, which are equals, must return the same hashcode. – Progman Sep 17 '17 at 17:16
  • The crux of it when the hashing collection involves, the equals only gets it turn when the hashcode matches with the entries present in underline collection. – Rizwan Sep 17 '17 at 17:25
  • I know what contract says, but is hashCode needed if one is not using HashSet/HastMap ? – Betlista Sep 19 '17 at 09:55

3 Answers3

2

Thanks for answers

My first try was to get all permutations (using guava thanks for my teammate to make it clean&clear) which make it numberParameters^2 and check if the set in cache contains any of them

private static boolean check(Set cache, String ref, String ip, String ue, String mail) {
    return Sets.powerSet(ImmutableSet.of(0, 1, 2, 3)).stream().map(set -> {
        BlacklistKey key = new BlacklistKey("*", "*", "*", "*");
        for (Integer idx : set) {
            switch (idx) {
                case 0:
                    key.setReferrer(ref);
                    break;
                case 1:
                    key.setIp(ip);
                    break;
                case 2:
                    key.setUserAgent(ue);
                    break;
                case 3:
                    key.setEmail(mail);
            }
        }
        return key;
    }).anyMatch(keys::contains);
}

Ended up using Lucene

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

class Blacklist {

    private static final String MATCH_ALL = "*";
    private static IndexSearcher cache;

    private enum Fields {
        REFERRER, IP, USER_AGENT, EMAIL
    }

    private static Document getDocument(String referrer, String ip, String userAgent, String email) {
        Document doc = new Document();
        doc.add(getStringField(referrer, Fields.REFERRER.name()));
        doc.add(getStringField(ip, Fields.IP.name()));
        doc.add(getStringField(userAgent, Fields.USER_AGENT.name()));
        doc.add(getStringField(email, Fields.EMAIL.name()));
        return doc;
    }

    private static StringField getStringField(String val, String field) {
        return new StringField(field, val, Field.Store.NO);
    }

    private static BooleanQuery createQuery(String referrer, String ip, String userAgent, String email) {
        return new BooleanQuery.Builder()
                .add(createBooleanQuery(Fields.REFERRER.name(), referrer), BooleanClause.Occur.FILTER)
                .add(createBooleanQuery(Fields.IP.name(), ip), BooleanClause.Occur.FILTER)
                .add(createBooleanQuery(Fields.USER_AGENT.name(), userAgent), BooleanClause.Occur.FILTER)
                .add(createBooleanQuery(Fields.EMAIL.name(), email), BooleanClause.Occur.FILTER)
                .build();
    }

    private static BooleanQuery createBooleanQuery(String key, String value) {
        return new BooleanQuery.Builder()
                .add(new TermQuery(new Term(key, value)), BooleanClause.Occur.SHOULD)
                .add(new TermQuery(new Term(key, MATCH_ALL)), BooleanClause.Occur.SHOULD)
                .build();
    }

    private static boolean isBlacklisted(String ref, String ip, String ue, String email) throws IOException {
        BooleanQuery query = createQuery(ref, ip, ue, email);
        return cache.search(query, 1).totalHits > 0;
    }


    public static void main(String[] args) throws IOException {
        RAMDirectory directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()));
        writer.addDocument(getDocument("ref1", "127.0.0.ip1", "Mozilla UserAgent1", "email.com"));
        writer.addDocument(getDocument("ref2", "127.0.0.ip2", "Mozilla UserAgent2", "*"));
        writer.close();
        DirectoryReader reader = DirectoryReader.open(directory);
        cache = new IndexSearcher(reader);

        System.out.println(isBlacklisted("ref1", "127.0.0.ip1", "Mozilla UserAgent1", "email.com"));
        System.out.println(isBlacklisted("r2.com", "127.0.0.4", "Mozilla", "yahoo.com"));
        System.out.println(isBlacklisted("ref2", "127.0.0.ip2", "Mozilla UserAgent2", "this is ignored"));
        System.out.println(isBlacklisted("*", "127.0.0.ip2", "Mozilla UserAgent2", "*"));
    }
}
whoopdedoo
  • 2,815
  • 23
  • 46
1

I recommend you to use RDB or Lucene.

However, I think you can reduce the cost with using binary-tree. This guaranteed log(n) time cost.

Procedure

  1. Prepare BlacklistEntry class implementing Comparator (Java Platform SE 8 )
  2. Override methods equals, hashCode and implement compare for Comparator
  3. Declare TreeSet<BlacklistEntry> and append your data
  4. Execute BlacklistEntry#contains(object), which will return it's blacklisted or not.

TreeSet (Java Platform SE 7 )

This implementation provides guaranteed log(n) time cost for the basic operations (add, remove and contains).

Sample

BlacklistEntry

package org.stackoverflow.btree;

import java.util.Comparator;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@AllArgsConstructor
@NoArgsConstructor
public class BlacklistEntry implements Comparator<BlacklistEntry> {

    private String referrer, ip, userAgent, email;

    @Override
    public boolean equals(Object obj) {

        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (!(obj instanceof BlacklistEntry))
            return false;

        BlacklistEntry other = (BlacklistEntry) obj;

        // You need to deal with "*" for all elements
        // There should be another implementations to solve troublesome
        // boiler-plate codes, for example to use reflection or commons.lang.
        if (email == null) {
            if (other.email != null) {
                return false;
            }
        } else if (!email.equals(other.email) && !(email.equals("*") || other.email.equals("*")) ) {
            return false;
        }

        if (ip == null) {
            if (other.ip != null) {
                return false;
            }
        } else if (!ip.equals(other.ip) && !(ip.equals("*") || other.ip.equals("*")) ) {
            return false;
        }

        if (referrer == null) {
            if (other.referrer != null) {
                return false;
            }
        } else if (!referrer.equals(other.referrer) && !(referrer.equals("*") || other.referrer.equals("*")) ) {
            return false;
        }

        if (userAgent == null) {
            if (other.userAgent != null) {
                return false;
            }
        } else if (!userAgent.equals(other.userAgent) && !(userAgent.equals("*") || other.userAgent.equals("*")) ) {
            return false;
        }

        return true;
    }

    // I just generated #hashCode with eclipse
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((email == null) ? 0 : email.hashCode());
        result = prime * result + ((ip == null) ? 0 : ip.hashCode());
        result = prime * result + ((referrer == null) ? 0 : referrer.hashCode());
        result = prime * result + ((userAgent == null) ? 0 : userAgent.hashCode());
        return result;
    }

    // if left obj & right obj is equals == true
    //   --> "*" or just completely matching
    // else
    //   --> detect which is larger than the another one
    //   (this #compare method is used by binary-tree)
    public int compare(BlacklistEntry o1, BlacklistEntry o2) {
        if (o1.equals(o2)) {
            return 0;
        } else {
            return (o1.hashCode() < o2.hashCode()) ? -1 : 1;
        }
    }

}

Main

package org.stackoverflow.btree;

import java.util.TreeSet;

public class Main {

    static TreeSet<BlacklistEntry> cache = new TreeSet<BlacklistEntry>(new BlacklistEntry());

    public static boolean isBlacklisted(String ref, String ip, String ue, String email) {
        return isBlacklisted(new BlacklistEntry(ref, ip, ue, email));
    }
    public static boolean isBlacklisted(BlacklistEntry e) {
        return cache.contains(e);
    }
    public static void main(String[] args) {
        cache.add(new BlacklistEntry("www.google.com", "127.0.0.2", "*", "yahoo.com") );

        System.out.println(isBlacklisted("www.google.com", "127.0.0.1", "IE", "yahoo.com"));
        System.out.println(isBlacklisted("www.google.com", "127.0.0.2", "Chrome", "yahoo.com"));
        System.out.println(isBlacklisted("www.google.com", "127.0.0.3", "Edge", "yahoo.com"));
        System.out.println(isBlacklisted("www.google.com", "127.0.0.4", "Mozilla", "yahoo.com"));
        System.out.println(isBlacklisted("www.google.com", "127.0.0.5", "Ruby", "yahoo.com"));
    }

}

Output

false
true
false
false
false
hiropon
  • 1,675
  • 2
  • 18
  • 41
  • I'm just guessing, but while there is random ordering (based on hash) this won't work for bigger amount of data. My guess is, that if you add 20 items to set you'll have strange outcomes... – Betlista Sep 19 '17 at 07:25
  • @Betlista Hmm, I will check it privately – hiropon Sep 19 '17 at 09:13
  • @hiropon would glad to know results, looks interesting although I ended up using lucene – whoopdedoo Sep 19 '17 at 09:16
  • 1
    @IddoE I tested several access logs ( I registered 5000 blacklist entries) . It looks working. If object's hashcode are duplicated . It will be problem. But, this case works because it requires match or not matching instead of comparing a and b. https://github.com/Hiroyuki-Nagata/stackoverflow/blob/master/src/main/java/org/stackoverflow/btree/Main.java – hiropon Sep 27 '17 at 05:39
1

Given that for each row, just one of the four columns can have a wildcard, one simple approach is to have five hash sets: one set matching all columns, and one set for each wildcard column that matches the three non-wildcard columns. Then, checking the blacklist consists looking up the entry in each of the five sets, which is asymptotic constant time (O(1)).

Proof of concept code (which can certainly be improved):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;

public class Blacklist {

    private static class BlacklistEntry {
        private String[] fields;

        private BlacklistEntry(String... fields) {
            this.fields = fields;
        }

        @Override
        public int hashCode() {
            return Objects.hash(fields);
        }

        @Override
        public boolean equals(final Object obj) {
            return obj instanceof BlacklistEntry && Arrays.equals(fields, ((BlacklistEntry)obj).fields);
        }
    }

    private static List<Set<BlacklistEntry>> wildcardSets = new ArrayList<>();

    private static final int FIELD_COUNT = 4;

    static {
        for (int i = 0; i < FIELD_COUNT; i++) {
            wildcardSets.add(new HashSet<>());
        }
    }
    private static Set<BlacklistEntry> fullMatchSet = new HashSet<>();

    private static boolean isBlacklisted(String... fields) {
        if (fullMatchSet.contains(new BlacklistEntry(fields))) {
            return true;
        }
        for (int i = 0; i < FIELD_COUNT; i++) {
            if (wildcardSets.get(i).contains(new BlacklistEntry(removeField(fields, i)))) {
                return true;
            }
        }
        return false;
    }

    private static void addEntry(BlacklistEntry entry) {
        if (entry.fields.length != FIELD_COUNT) {
            throw new RuntimeException("Invalid entry, should contain four fields");
        }
        for (int i = 0; i < FIELD_COUNT; i++) {
            if ("*".equals(entry.fields[i])) {
                // Make a copy of the fields list without the wildcard, and add it to the correct set.
                String[] fields = removeField(entry.fields, i);
                wildcardSets.get(i).add(new BlacklistEntry(fields));
                return;
            }
        }
        // No wildcards in field list.
        fullMatchSet.add(entry);
    }

    private static String[] removeField(final String[] fields, final int index) {
        String[] newFields = new String[fields.length - 1];
        for (int i = 0; i < index; i++) {
            newFields[i] = fields[i];
        }
        for (int i = index + 1; i < FIELD_COUNT; i++) {
            newFields[i - 1] = fields[i];
        }
        return newFields;
    }


    public static void main(String[] args) {

        addEntry(new BlacklistEntry("r1.com", "127.0.0.1", "UA1", "*"));
        addEntry(new BlacklistEntry("r2.com", "127.0.0.2", "*", "e1.com"));
        addEntry(new BlacklistEntry("r3.com", "*", "UA2", "e2.com"));
        addEntry(new BlacklistEntry("*", "127.0.0.3", "UA3", "e3.com"));
        addEntry(new BlacklistEntry("r4.com", "127.0.0.4", "UA4", "e4.com"));

        // All these should return true
        System.out.println(isBlacklisted("r1.com", "127.0.0.1", "UA1", "wildcard"));
        System.out.println(isBlacklisted("r2.com", "127.0.0.2", "wildcard", "e1.com"));
        System.out.println(isBlacklisted("r3.com", "wildcard", "UA2", "e2.com"));
        System.out.println(isBlacklisted("wildcard", "127.0.0.3", "UA3", "e3.com"));
        System.out.println(isBlacklisted("r4.com", "127.0.0.4", "UA4", "e4.com"));

        // All these should return false
        System.out.println(isBlacklisted("r1.com", "127.0.0.1", "no match", "wildcard"));
        System.out.println(isBlacklisted("no match", "127.0.0.2", "wildcard", "e1.com"));
        System.out.println(isBlacklisted("r3.com", "wildcard", "no match", "e2.com"));
        System.out.println(isBlacklisted("wildcard", "127.0.0.3", "UA3", "no match"));
        System.out.println(isBlacklisted("r5.com", "127.0.0.5", "UA4", "e5.com"));
    }

}
markusk
  • 6,477
  • 34
  • 39