this is not an answer, but, little insight as to anyone else having to deal with such things. may that help you.
I slightly revised stuff here and their, then gave a shot to a Godel-inspired scheme
as described at https://stackoverflow.com/a/396077/11892070
-- main.go --
package main
import (
"fmt"
"sort"
"strings"
)
func main() {
input := []string{"tan", "nat", "⌘", "日本語", "語日本"}
freq := map[rune]int{}
for _, word := range input {
x, err := hashWord(word, freq)
fmt.Println(word, "=>", x, "err=", err)
}
}
func groupAnagramsUsingSort(s []string, tmp map[string][]string, out [][]string) [][]string {
for k := range tmp {
delete(tmp, k)
}
for i := 0; i < len(out); i++ {
out[i] = out[i][:0]
}
out = out[:0]
for _, v := range s {
x := strings.Split(v, "")
sort.Strings(x)
anagram := strings.Join(x, "")
items, ok := tmp[anagram]
if ok {
items = append(items, v)
tmp[anagram] = items
continue
}
tmp[anagram] = []string{v}
}
for key := range tmp {
out = append(out, tmp[key])
}
return out
}
func groupAnagramsUsingHash(s []string, tmp map[int][]string, out [][]string) [][]string {
for k := range tmp {
delete(tmp, k)
}
for i := 0; i < len(out); i++ {
out[i] = out[i][:0]
}
out = out[:0]
freq := map[rune]int{}
for _, v := range s {
hash, _ := hashWord(v, freq)
items, ok := tmp[hash]
if ok {
items = append(items, v)
tmp[hash] = items
continue
}
tmp[hash] = []string{v}
}
for key := range tmp {
out = append(out, tmp[key])
}
return out
}
var primes = []int{2, 41, 37, 47, 3, 67, 71, 23, 5, 101, 61, 17, 19, 13, 31, 43, 97, 29, 11, 7, 73, 83, 79, 89, 59, 53}
var ErrNonASCII = fmt.Errorf("non ascii letter detected")
func getFrequencyMap(word string, freq map[rune]int) (map[rune]int, error) {
for k := range freq {
delete(freq, k)
}
for _, r := range word {
if r-97 < 0 || int(r-97) > len(primes) {
return nil, ErrNonASCII
}
x := freq[r]
freq[r] = x + 1
}
return freq, nil
}
func hashWord(word string, freq map[rune]int) (int, error) {
var err error
freq, err = getFrequencyMap(word, freq)
if err != nil {
return -1, err
}
product := 1
for letter, r := range freq {
product = product * primes[letter-97]
for e := 1; e < r; e++ {
product = product * product
}
}
return product, nil
}
-- main_test.go --
package main
import (
"reflect"
"sort"
"testing"
)
type expectation struct {
input []string
want [][]string
}
var expectations = []expectation{
expectation{
input: []string{"eat", "tea", "tan", "ate", "nat", "bat"},
want: [][]string{
[]string{"ate", "eat", "tea"},
[]string{"bat"},
[]string{"nat", "tan"},
},
},
expectation{
input: []string{"eaft", "tea", "taen", "ate", "nate", "batf"},
want: [][]string{
[]string{"batf"},
[]string{"eaft"},
[]string{"tea", "ate"},
[]string{"taen", "nate"},
},
},
expectation{
input: []string{""},
want: [][]string{
[]string{""},
},
},
expectation{
input: []string{"a"},
want: [][]string{
[]string{"a"},
},
},
}
func TestUsingSort(t *testing.T) {
tmp := map[string][]string{}
out := [][]string{}
for _, expectation := range expectations {
out = groupAnagramsUsingSort(expectation.input, tmp, out)
if len(out) != len(expectation.want) {
t.Fatalf("unexpected output,\nwanted=%#v\ngot =%#v\n", expectation.want, out)
}
for i := 0; i < len(out); i++ {
sort.Strings(out[i])
sort.Strings(expectation.want[i])
}
sort.Slice(out, func(i int, j int) bool {
return len(out[i]) < len(out[j])
})
sort.Slice(expectation.want, func(i int, j int) bool {
return len(expectation.want[i]) < len(expectation.want[j])
})
sort.Slice(out, func(i int, j int) bool {
return (len(out[i]) > 0 &&
len(out[j]) > 0 &&
out[i][0] < out[j][0])
})
sort.Slice(expectation.want, func(i int, j int) bool {
return (len(expectation.want[i]) > 0 &&
len(expectation.want[j]) > 0 &&
expectation.want[i][0] < expectation.want[j][0])
})
for i := 0; i < len(out); i++ {
if !reflect.DeepEqual(out[i], expectation.want[i]) {
t.Fatalf("unexpected output,\nwanted=%#v\ngot =%#v\n", expectation.want, out)
}
}
}
}
func TestUsingHash(t *testing.T) {
tmp := map[int][]string{}
out := [][]string{}
for _, expectation := range expectations {
out = groupAnagramsUsingHash(expectation.input, tmp, out)
if len(out) != len(expectation.want) {
t.Fatalf("unexpected output,\nwanted=%#v\ngot =%#v\n", expectation.want, out)
}
for i := 0; i < len(out); i++ {
sort.Strings(out[i])
sort.Strings(expectation.want[i])
}
sort.Slice(out, func(i int, j int) bool {
return len(out[i]) < len(out[j])
})
sort.Slice(expectation.want, func(i int, j int) bool {
return len(expectation.want[i]) < len(expectation.want[j])
})
sort.Slice(out, func(i int, j int) bool {
return (len(out[i]) > 0 &&
len(out[j]) > 0 &&
out[i][0] < out[j][0])
})
sort.Slice(expectation.want, func(i int, j int) bool {
return (len(expectation.want[i]) > 0 &&
len(expectation.want[j]) > 0 &&
expectation.want[i][0] < expectation.want[j][0])
})
for i := 0; i < len(out); i++ {
if !reflect.DeepEqual(out[i], expectation.want[i]) {
t.Fatalf("unexpected output,\nwanted=%#v\ngot =%#v\n", expectation.want, out)
}
}
}
}
func BenchmarkUsingSort(b *testing.B) {
tmp := map[string][]string{}
out := [][]string{}
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, expectation := range expectations {
out = groupAnagramsUsingSort(expectation.input, tmp, out)
_ = out
}
}
}
func BenchmarkUsingHash(b *testing.B) {
tmp := map[int][]string{}
out := [][]string{}
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for _, expectation := range expectations {
out = groupAnagramsUsingHash(expectation.input, tmp, out)
_ = out
}
}
}
Benchmark result
$ go test -bench=. -v .
=== RUN TestUsingSort
--- PASS: TestUsingSort (0.00s)
=== RUN TestUsingHash
--- PASS: TestUsingHash (0.00s)
goos: linux
goarch: amd64
BenchmarkUsingSort
BenchmarkUsingSort-4 344438 3315 ns/op 787 B/op 29 allocs/op
BenchmarkUsingHash
BenchmarkUsingHash-4 410810 2911 ns/op 496 B/op 17 allocs/op
PASS
ok _/home/clementauger/tmp 2.408s