I am looking for a way to identify similar records in the Oracle 11.2 table within a sliding 3 sec window. There are approximately 500K rows in the table inserted within 24 hours period.
Requirements:
- These records should satisfy a similarity score of at least 88% using UTL_MATCH.JARO_WINKLER_SIMILARITY
- The FLAG column should be updated for a given record if there is at least one more similar record exists within three seconds window
Table definition:
CREATE TABLE ADDR_TAB
( DT DATE NOT NULL,
ADDR VARCHAR2(200) NOT NULL,
FLAG INT
);
CREATE INDEX ADDR_DATE_IDX ON ADDR_TAB(DT);
Sample data:
insert into addr_tab values (to_date('03-OCT-13 04.36.57 PM','DD-MON-RR HH.MI.SS AM'),'test',null);
insert into addr_tab values (to_date('03-OCT-13 04.36.57 PM','DD-MON-RR HH.MI.SS AM'),'test123',null);
insert into addr_tab values (to_date('03-OCT-13 04.36.58 PM','DD-MON-RR HH.MI.SS AM'),'2test2',null);
insert into addr_tab values (to_date('03-OCT-13 04.36.58 PM','DD-MON-RR HH.MI.SS AM'),'12test',null);
insert into addr_tab values (to_date('03-OCT-13 04.37.00 PM','DD-MON-RR HH.MI.SS AM'),'12test',null);
insert into addr_tab values (to_date('03-OCT-13 04.37.02 PM','DD-MON-RR HH.MI.SS AM'),'12test',null);
insert into addr_tab values (to_date('03-OCT-13 04.37.03 PM','DD-MON-RR HH.MI.SS AM'),'1test87',null);
insert into addr_tab values (to_date('03-OCT-13 04.37.03 PM','DD-MON-RR HH.MI.SS AM'),'12test',null);
insert into addr_tab values (to_date('03-OCT-13 04.37.03 PM','DD-MON-RR HH.MI.SS AM'),'12test56',null);
insert into addr_tab values (to_date('03-OCT-13 04.37.04 PM','DD-MON-RR HH.MI.SS AM'),'12test88',null);
insert into addr_tab values (to_date('03-OCT-13 04.37.05 PM','DD-MON-RR HH.MI.SS AM'),'12test56',null);
SQLFiddle: http://sqlfiddle.com/#!4/1b53f/1