This solution implements a backward reader.
It reads a file starting from the end by block of b.Len
bytes, then it look forwards for a separator, currently \n
within the block, it then advances the starting offset by SepIndex
(this is to prevent having the search string being split over two consecutive read). Before proceeding to the next block read, it lookups for the search
string within the block read, if found, it returns its starting position within the file and stops.
Otherwise, it reduces the start offset by b.Len
then read next block.
For as long as your search string is in the last 40% of the file, you should get better performance, but this is to be battle tested.
If your search string is within the last 10%, i am confident you will get a win.
main.go
package main
import (
"bytes"
"flag"
"fmt"
"io"
"log"
"os"
"time"
"github.com/mattetti/filebuffer"
)
func main() {
var search string
var sep string
var verbose bool
flag.StringVar(&search, "search", "findme", "search word")
flag.StringVar(&sep, "sep", "\n", "separator for the search detection")
flag.BoolVar(&verbose, "v", false, "verbosity")
flag.Parse()
d := make(chan struct{})
b := &bytes.Buffer{}
go func() {
io.Copy(b, os.Stdin)
d <- struct{}{}
}()
<-time.After(time.Millisecond)
select {
case <-d:
default:
os.Stdin.Close()
}
readSize := 1024
if b.Len() < 1 {
input := fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), readSize-5), search)
input += input
b.WriteString(input)
}
bsearch := []byte(search)
s, err := bytesSearch(b.Bytes())
if err != nil {
log.Fatal(err)
}
if verbose {
s.logger = log.New(os.Stderr, "", log.LstdFlags)
}
s.Buffer = make([]byte, readSize)
s.Sep = []byte(sep)
got, err := s.Index(bsearch)
if err != nil {
log.Fatal(err)
}
fmt.Println("Index ", got)
got, err = s.Index2(bsearch)
if err != nil {
log.Fatal(err)
}
fmt.Println("Index ", got)
}
type tailSearch struct {
F io.ReadSeeker
Buffer []byte
Sep []byte
start int64
logger interface {
Println(...interface{})
}
}
func fileSearch(f *os.File) (ret tailSearch, err error) {
ret.F = f
st, err := f.Stat()
if err != nil {
return
}
ret.start = st.Size()
ret.Sep = []byte("\n")
return ret, nil
}
func bytesSearch(b []byte) (ret tailSearch, err error) {
ret.F = filebuffer.New(b)
ret.start = int64(len(b))
ret.Sep = []byte("\n")
return
}
func (b tailSearch) Index(search []byte) (int64, error) {
if b.Buffer == nil {
b.Buffer = make([]byte, 1024, 1024)
}
buf := b.Buffer
blen := len(b.Buffer)
hasended := false
for !hasended {
if b.logger != nil {
b.logger.Println("a start", b.start)
}
offset := b.start - int64(blen)
if offset < 0 {
offset = 0
hasended = true
}
_, err := b.F.Seek(offset, 0)
if err != nil {
hasended = true
}
n, err := b.F.Read(buf)
if b.logger != nil {
b.logger.Println("f n", n, "err", err)
}
if err != nil {
hasended = true
}
buf = buf[:n]
b.start -= int64(n)
if b.logger != nil {
b.logger.Println("g start", b.start)
}
if b.start > 0 {
i := bytes.Index(buf, b.Sep)
if b.logger != nil {
b.logger.Println("h sep", i)
}
if i > -1 {
b.start += int64(i)
buf = buf[i:]
if b.logger != nil {
b.logger.Println("i start", b.start)
}
}
}
if e := bytes.LastIndex(buf, search); e > -1 {
return b.start + int64(e), nil
}
}
return -1, nil
}
func (b tailSearch) Index2(search []byte) (int64, error) {
if b.Buffer == nil {
b.Buffer = make([]byte, 1024, 1024)
}
buf := b.Buffer
blen := len(b.Buffer)
hasended := false
for !hasended {
if b.logger != nil {
b.logger.Println("a start", b.start)
}
offset := b.start - int64(blen)
if offset < 0 {
offset = 0
hasended = true
}
_, err := b.F.Seek(offset, 0)
if err != nil {
hasended = true
}
n, err := b.F.Read(buf)
if b.logger != nil {
b.logger.Println("f n", n, "err", err)
}
if err != nil {
hasended = true
}
buf = buf[:n]
b.start -= int64(n)
if b.logger != nil {
b.logger.Println("g start", b.start)
}
for i := 1; i < len(search); i++ {
if bytes.HasPrefix(buf, search[i:]) {
e := i - len(search)
b.start += int64(e)
buf = buf[e:]
}
}
if b.logger != nil {
b.logger.Println("g start", b.start)
}
if e := bytes.LastIndex(buf, search); e > -1 {
return b.start + int64(e), nil
}
}
return -1, nil
}
main_test.go
package main
import (
"bytes"
"fmt"
"strings"
"testing"
)
func TestOne(t *testing.T) {
type test struct {
search []byte
readLen int
input string
sep []byte
want int64
}
search := []byte("find me")
blockLen := 1024
tests := []test{
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%stail content", search),
want: 0,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf(""),
want: -1,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: strings.Repeat("nop\n", 10000),
want: -1,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen-5), search),
want: 1019,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen), search),
want: 1024,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen+10), search),
want: 1034,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), (blockLen*2)+10), search),
want: 2058,
},
test{
search: search,
sep: []byte("\n"),
readLen: blockLen,
input: fmt.Sprintf("%s%s%stail content", bytes.Repeat([]byte(" "), (blockLen*2)+10), search, search),
want: 2065,
},
}
for i, test := range tests {
s, err := bytesSearch([]byte(test.input))
if err != nil {
t.Fatal(err)
}
s.Buffer = make([]byte, test.readLen)
s.Sep = test.sep
got, err := s.Index(test.search)
if err != nil {
t.Fatal(err)
}
if got != test.want {
t.Fatalf("invalid index at %v got %v wanted %v", i, got, test.want)
}
got, err = s.Index2(test.search)
if err != nil {
t.Fatal(err)
}
if got != test.want {
t.Fatalf("invalid index at %v got %v wanted %v", i, got, test.want)
}
}
}
bench_test.go
package main
import (
"bytes"
"fmt"
"testing"
"github.com/mattetti/filebuffer"
)
func BenchmarkIndex(b *testing.B) {
search := []byte("find me")
blockLen := 1024
input := fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen-5), search)
input += input
s := tailSearch{}
s.F = filebuffer.New([]byte(input))
s.Buffer = make([]byte, blockLen)
for i := 0; i < b.N; i++ {
s.start = int64(len(input))
_, err := s.Index(search)
if err != nil {
b.Fatal(err)
}
}
}
func BenchmarkIndex2(b *testing.B) {
search := []byte("find me")
blockLen := 1024
input := fmt.Sprintf("%s%stail content", bytes.Repeat([]byte(" "), blockLen-5), search)
input += input
s := tailSearch{}
s.F = filebuffer.New([]byte(input))
s.Buffer = make([]byte, blockLen)
for i := 0; i < b.N; i++ {
s.start = int64(len(input))
_, err := s.Index2(search)
if err != nil {
b.Fatal(err)
}
}
}
testing
$ go test -v
=== RUN TestOne
--- PASS: TestOne (0.00s)
PASS
ok test/backwardsearch 0.002s
$ go test -bench=. -benchmem -v
=== RUN TestOne
--- PASS: TestOne (0.00s)
goos: linux
goarch: amd64
pkg: test/backwardsearch
BenchmarkIndex-4 20000000 108 ns/op 0 B/op 0 allocs/op
BenchmarkIndex2-4 10000000 167 ns/op 0 B/op 0 allocs/op
PASS
ok test/backwardsearch 4.129s
$ echo "rrrrfindme" | go run main.go -v
2019/10/17 12:17:04 a start 11
2019/10/17 12:17:04 f n 11 err <nil>
2019/10/17 12:17:04 g start 0
Index 4
2019/10/17 12:17:04 a start 11
2019/10/17 12:17:04 f n 11 err <nil>
2019/10/17 12:17:04 g start 0
2019/10/17 12:17:04 g start 0
Index 4
$ cat bench_test.go | go run main.go -search main
Index 8
Index 8
$ go run main.go
Index 2056
Index 2056