MPH domain matcher: Support building & using cache directly (instead of building from geosite.dat when Xray starts) (#5505)

Like https://github.com/XTLS/Xray-core/pull/5488#issuecomment-3710995080
This commit is contained in:
Hossin Asaadi
2026-01-31 16:45:39 +03:30
committed by RPRX
parent afcfdbca70
commit 61e1153157
18 changed files with 988 additions and 161 deletions
+2
View File
@@ -24,6 +24,8 @@ const (
XUDPBaseKey = "xray.xudp.basekey"
TunFdKey = "xray.tun.fd"
MphCachePath = "xray.mph.cache"
)
type EnvFlag struct {
+61 -61
View File
@@ -7,8 +7,8 @@ import (
const validCharCount = 53
type MatchType struct {
matchType Type
exist bool
Type Type
Exist bool
}
const (
@@ -17,23 +17,23 @@ const (
)
type Edge struct {
edgeType bool
nextNode int
Type bool
NextNode int
}
type ACAutomaton struct {
trie [][validCharCount]Edge
fail []int
exists []MatchType
count int
Trie [][validCharCount]Edge
Fail []int
Exists []MatchType
Count int
}
func newNode() [validCharCount]Edge {
var s [validCharCount]Edge
for i := range s {
s[i] = Edge{
edgeType: FailEdge,
nextNode: 0,
Type: FailEdge,
NextNode: 0,
}
}
return s
@@ -123,11 +123,11 @@ var char2Index = []int{
func NewACAutomaton() *ACAutomaton {
ac := new(ACAutomaton)
ac.trie = append(ac.trie, newNode())
ac.fail = append(ac.fail, 0)
ac.exists = append(ac.exists, MatchType{
matchType: Full,
exist: false,
ac.Trie = append(ac.Trie, newNode())
ac.Fail = append(ac.Fail, 0)
ac.Exists = append(ac.Exists, MatchType{
Type: Full,
Exist: false,
})
return ac
}
@@ -136,53 +136,53 @@ func (ac *ACAutomaton) Add(domain string, t Type) {
node := 0
for i := len(domain) - 1; i >= 0; i-- {
idx := char2Index[domain[i]]
if ac.trie[node][idx].nextNode == 0 {
ac.count++
if len(ac.trie) < ac.count+1 {
ac.trie = append(ac.trie, newNode())
ac.fail = append(ac.fail, 0)
ac.exists = append(ac.exists, MatchType{
matchType: Full,
exist: false,
if ac.Trie[node][idx].NextNode == 0 {
ac.Count++
if len(ac.Trie) < ac.Count+1 {
ac.Trie = append(ac.Trie, newNode())
ac.Fail = append(ac.Fail, 0)
ac.Exists = append(ac.Exists, MatchType{
Type: Full,
Exist: false,
})
}
ac.trie[node][idx] = Edge{
edgeType: TrieEdge,
nextNode: ac.count,
ac.Trie[node][idx] = Edge{
Type: TrieEdge,
NextNode: ac.Count,
}
}
node = ac.trie[node][idx].nextNode
node = ac.Trie[node][idx].NextNode
}
ac.exists[node] = MatchType{
matchType: t,
exist: true,
ac.Exists[node] = MatchType{
Type: t,
Exist: true,
}
switch t {
case Domain:
ac.exists[node] = MatchType{
matchType: Full,
exist: true,
ac.Exists[node] = MatchType{
Type: Full,
Exist: true,
}
idx := char2Index['.']
if ac.trie[node][idx].nextNode == 0 {
ac.count++
if len(ac.trie) < ac.count+1 {
ac.trie = append(ac.trie, newNode())
ac.fail = append(ac.fail, 0)
ac.exists = append(ac.exists, MatchType{
matchType: Full,
exist: false,
if ac.Trie[node][idx].NextNode == 0 {
ac.Count++
if len(ac.Trie) < ac.Count+1 {
ac.Trie = append(ac.Trie, newNode())
ac.Fail = append(ac.Fail, 0)
ac.Exists = append(ac.Exists, MatchType{
Type: Full,
Exist: false,
})
}
ac.trie[node][idx] = Edge{
edgeType: TrieEdge,
nextNode: ac.count,
ac.Trie[node][idx] = Edge{
Type: TrieEdge,
NextNode: ac.Count,
}
}
node = ac.trie[node][idx].nextNode
ac.exists[node] = MatchType{
matchType: t,
exist: true,
node = ac.Trie[node][idx].NextNode
ac.Exists[node] = MatchType{
Type: t,
Exist: true,
}
default:
break
@@ -192,8 +192,8 @@ func (ac *ACAutomaton) Add(domain string, t Type) {
func (ac *ACAutomaton) Build() {
queue := list.New()
for i := 0; i < validCharCount; i++ {
if ac.trie[0][i].nextNode != 0 {
queue.PushBack(ac.trie[0][i])
if ac.Trie[0][i].NextNode != 0 {
queue.PushBack(ac.Trie[0][i])
}
}
for {
@@ -201,16 +201,16 @@ func (ac *ACAutomaton) Build() {
if front == nil {
break
} else {
node := front.Value.(Edge).nextNode
node := front.Value.(Edge).NextNode
queue.Remove(front)
for i := 0; i < validCharCount; i++ {
if ac.trie[node][i].nextNode != 0 {
ac.fail[ac.trie[node][i].nextNode] = ac.trie[ac.fail[node]][i].nextNode
queue.PushBack(ac.trie[node][i])
if ac.Trie[node][i].NextNode != 0 {
ac.Fail[ac.Trie[node][i].NextNode] = ac.Trie[ac.Fail[node]][i].NextNode
queue.PushBack(ac.Trie[node][i])
} else {
ac.trie[node][i] = Edge{
edgeType: FailEdge,
nextNode: ac.trie[ac.fail[node]][i].nextNode,
ac.Trie[node][i] = Edge{
Type: FailEdge,
NextNode: ac.Trie[ac.Fail[node]][i].NextNode,
}
}
}
@@ -230,9 +230,9 @@ func (ac *ACAutomaton) Match(s string) bool {
return false
}
idx := char2Index[chr]
fullMatch = fullMatch && ac.trie[node][idx].edgeType
node = ac.trie[node][idx].nextNode
switch ac.exists[node].matchType {
fullMatch = fullMatch && ac.Trie[node][idx].Type
node = ac.Trie[node][idx].NextNode
switch ac.Exists[node].Type {
case Substr:
return true
case Domain:
@@ -243,5 +243,5 @@ func (ac *ACAutomaton) Match(s string) bool {
break
}
}
return fullMatch && ac.exists[node].exist
return fullMatch && ac.Exists[node].Exist
}
+10 -6
View File
@@ -39,14 +39,18 @@ func (m domainMatcher) String() string {
return "domain:" + string(m)
}
type regexMatcher struct {
pattern *regexp.Regexp
type RegexMatcher struct {
Pattern string
reg *regexp.Regexp
}
func (m *regexMatcher) Match(s string) bool {
return m.pattern.MatchString(s)
func (m *RegexMatcher) Match(s string) bool {
if m.reg == nil {
m.reg = regexp.MustCompile(m.Pattern)
}
return m.reg.MatchString(s)
}
func (m *regexMatcher) String() string {
return "regexp:" + m.pattern.String()
func (m *RegexMatcher) String() string {
return "regexp:" + m.Pattern
}
+57 -53
View File
@@ -25,40 +25,40 @@ func RollingHash(s string) uint32 {
// 2. `substr` patterns are matched by ac automaton;
// 3. `regex` patterns are matched with the regex library.
type MphMatcherGroup struct {
ac *ACAutomaton
otherMatchers []matcherEntry
rules []string
level0 []uint32
level0Mask int
level1 []uint32
level1Mask int
count uint32
ruleMap *map[string]uint32
Ac *ACAutomaton
OtherMatchers []MatcherEntry
Rules []string
Level0 []uint32
Level0Mask int
Level1 []uint32
Level1Mask int
Count uint32
RuleMap *map[string]uint32
}
func (g *MphMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) {
h := RollingHash(pattern)
switch t {
case Domain:
(*g.ruleMap)["."+pattern] = h*PrimeRK + uint32('.')
(*g.RuleMap)["."+pattern] = h*PrimeRK + uint32('.')
fallthrough
case Full:
(*g.ruleMap)[pattern] = h
(*g.RuleMap)[pattern] = h
default:
}
}
func NewMphMatcherGroup() *MphMatcherGroup {
return &MphMatcherGroup{
ac: nil,
otherMatchers: nil,
rules: nil,
level0: nil,
level0Mask: 0,
level1: nil,
level1Mask: 0,
count: 1,
ruleMap: &map[string]uint32{},
Ac: nil,
OtherMatchers: nil,
Rules: nil,
Level0: nil,
Level0Mask: 0,
Level1: nil,
Level1Mask: 0,
Count: 1,
RuleMap: &map[string]uint32{},
}
}
@@ -66,10 +66,10 @@ func NewMphMatcherGroup() *MphMatcherGroup {
func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
switch t {
case Substr:
if g.ac == nil {
g.ac = NewACAutomaton()
if g.Ac == nil {
g.Ac = NewACAutomaton()
}
g.ac.Add(pattern, t)
g.Ac.Add(pattern, t)
case Full, Domain:
pattern = strings.ToLower(pattern)
g.AddFullOrDomainPattern(pattern, t)
@@ -78,39 +78,39 @@ func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
if err != nil {
return 0, err
}
g.otherMatchers = append(g.otherMatchers, matcherEntry{
m: &regexMatcher{pattern: r},
id: g.count,
g.OtherMatchers = append(g.OtherMatchers, MatcherEntry{
M: &RegexMatcher{Pattern: pattern, reg: r},
Id: g.Count,
})
default:
panic("Unknown type")
}
return g.count, nil
return g.Count, nil
}
// Build builds a minimal perfect hash table and ac automaton from insert rules
func (g *MphMatcherGroup) Build() {
if g.ac != nil {
g.ac.Build()
if g.Ac != nil {
g.Ac.Build()
}
keyLen := len(*g.ruleMap)
keyLen := len(*g.RuleMap)
if keyLen == 0 {
keyLen = 1
(*g.ruleMap)["empty___"] = RollingHash("empty___")
(*g.RuleMap)["empty___"] = RollingHash("empty___")
}
g.level0 = make([]uint32, nextPow2(keyLen/4))
g.level0Mask = len(g.level0) - 1
g.level1 = make([]uint32, nextPow2(keyLen))
g.level1Mask = len(g.level1) - 1
sparseBuckets := make([][]int, len(g.level0))
g.Level0 = make([]uint32, nextPow2(keyLen/4))
g.Level0Mask = len(g.Level0) - 1
g.Level1 = make([]uint32, nextPow2(keyLen))
g.Level1Mask = len(g.Level1) - 1
sparseBuckets := make([][]int, len(g.Level0))
var ruleIdx int
for rule, hash := range *g.ruleMap {
n := int(hash) & g.level0Mask
g.rules = append(g.rules, rule)
for rule, hash := range *g.RuleMap {
n := int(hash) & g.Level0Mask
g.Rules = append(g.Rules, rule)
sparseBuckets[n] = append(sparseBuckets[n], ruleIdx)
ruleIdx++
}
g.ruleMap = nil
g.RuleMap = nil
var buckets []indexBucket
for n, vals := range sparseBuckets {
if len(vals) > 0 {
@@ -119,7 +119,7 @@ func (g *MphMatcherGroup) Build() {
}
sort.Sort(bySize(buckets))
occ := make([]bool, len(g.level1))
occ := make([]bool, len(g.Level1))
var tmpOcc []int
for _, bucket := range buckets {
seed := uint32(0)
@@ -127,7 +127,7 @@ func (g *MphMatcherGroup) Build() {
findSeed := true
tmpOcc = tmpOcc[:0]
for _, i := range bucket.vals {
n := int(strhashFallback(unsafe.Pointer(&g.rules[i]), uintptr(seed))) & g.level1Mask
n := int(strhashFallback(unsafe.Pointer(&g.Rules[i]), uintptr(seed))) & g.Level1Mask
if occ[n] {
for _, n := range tmpOcc {
occ[n] = false
@@ -138,10 +138,10 @@ func (g *MphMatcherGroup) Build() {
}
occ[n] = true
tmpOcc = append(tmpOcc, n)
g.level1[n] = uint32(i)
g.Level1[n] = uint32(i)
}
if findSeed {
g.level0[bucket.n] = seed
g.Level0[bucket.n] = seed
break
}
}
@@ -159,11 +159,11 @@ func nextPow2(v int) int {
// Lookup searches for s in t and returns its index and whether it was found.
func (g *MphMatcherGroup) Lookup(h uint32, s string) bool {
i0 := int(h) & g.level0Mask
seed := g.level0[i0]
i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.level1Mask
n := g.level1[i1]
return s == g.rules[int(n)]
i0 := int(h) & g.Level0Mask
seed := g.Level0[i0]
i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.Level1Mask
n := g.Level1[i1]
return s == g.Rules[int(n)]
}
// Match implements IndexMatcher.Match.
@@ -183,13 +183,13 @@ func (g *MphMatcherGroup) Match(pattern string) []uint32 {
result = append(result, 1)
return result
}
if g.ac != nil && g.ac.Match(pattern) {
if g.Ac != nil && g.Ac.Match(pattern) {
result = append(result, 1)
return result
}
for _, e := range g.otherMatchers {
if e.m.Match(pattern) {
result = append(result, e.id)
for _, e := range g.OtherMatchers {
if e.M.Match(pattern) {
result = append(result, e.Id)
return result
}
}
@@ -302,3 +302,7 @@ func readUnaligned64(p unsafe.Pointer) uint64 {
q := (*[8]byte)(p)
return uint64(q[0]) | uint64(q[1])<<8 | uint64(q[2])<<16 | uint64(q[3])<<24 | uint64(q[4])<<32 | uint64(q[5])<<40 | uint64(q[6])<<48 | uint64(q[7])<<56
}
// Size implements IndexMatcher.Size by returning the group's Count field.
//
// NOTE(review): NewMphMatcherGroup initializes Count to 1 and no code visible
// here increments it — confirm that callers combining groups expect that value.
func (g *MphMatcherGroup) Size() uint32 {
	return g.Count
}
+47
View File
@@ -0,0 +1,47 @@
package strmatcher
import (
"bytes"
"encoding/gob"
"io"
)
// init registers the concrete matcher types with gob so that values held in
// the Matcher interface field of MatcherEntry can be encoded and decoded.
func init() {
	samples := []interface{}{
		&RegexMatcher{},
		fullMatcher(""),
		substrMatcher(""),
		domainMatcher(""),
	}
	for _, sample := range samples {
		gob.Register(sample)
	}
}
// Serialize gob-encodes the lookup state of g (automaton, other matchers,
// rules, both hash levels with their masks, and Count) into w.
//
// RuleMap is deliberately left out of the encoded value: it is only the
// intermediate input of Build, which sets it to nil once the tables exist.
func (g *MphMatcherGroup) Serialize(w io.Writer) error {
	// Work on a shallow copy so the receiver itself is never modified.
	snapshot := *g
	snapshot.RuleMap = nil

	encoder := gob.NewEncoder(w)
	return encoder.Encode(snapshot)
}
// invalidMphCacheError explains why decoded cache data cannot be used safely
// for lookups.
type invalidMphCacheError string

// Error implements the error interface.
func (e invalidMphCacheError) Error() string {
	return "strmatcher: invalid mph cache: " + string(e)
}

// checkMphTables verifies that the minimal-perfect-hash tables of g are
// mutually consistent, i.e. that every index Lookup can compute stays in
// range: each mask must equal its table length minus one, and every Level1
// entry must refer to an existing rule.
//
// The AC automaton and the other matchers are not inspected here.
func checkMphTables(g *MphMatcherGroup) error {
	if len(g.Level0) == 0 || g.Level0Mask != len(g.Level0)-1 {
		return invalidMphCacheError("level0 table and mask disagree")
	}
	if len(g.Level1) == 0 || g.Level1Mask != len(g.Level1)-1 {
		return invalidMphCacheError("level1 table and mask disagree")
	}
	ruleCount := uint64(len(g.Rules))
	for _, idx := range g.Level1 {
		// Also rejects an empty rule list, because Level1 is non-empty here.
		if uint64(idx) >= ruleCount {
			return invalidMphCacheError("level1 entry points past the rule list")
		}
	}
	return nil
}

// NewMphMatcherGroupFromBuffer reconstructs a ready-to-use MphMatcherGroup
// from data previously produced by Serialize.
//
// It returns the gob decoding error unchanged when data cannot be decoded,
// and an error when the decoded hash tables are inconsistent (for example a
// truncated or corrupted cache, or a group that was serialized before Build).
// Without that check such data would only fail later, as an out-of-range
// panic inside Lookup.
//
// The returned group starts from NewMphMatcherGroup, so its RuleMap is a
// fresh empty map rather than nil.
func NewMphMatcherGroupFromBuffer(data []byte) (*MphMatcherGroup, error) {
	var gData MphMatcherGroup
	if err := gob.NewDecoder(bytes.NewReader(data)).Decode(&gData); err != nil {
		return nil, err
	}
	if err := checkMphTables(&gData); err != nil {
		return nil, err
	}

	g := NewMphMatcherGroup()
	g.Ac = gData.Ac
	g.OtherMatchers = gData.OtherMatchers
	g.Rules = gData.Rules
	g.Level0 = gData.Level0
	g.Level0Mask = gData.Level0Mask
	g.Level1 = gData.Level1
	g.Level1Mask = gData.Level1Mask
	g.Count = gData.Count
	return g, nil
}
+44 -11
View File
@@ -41,8 +41,9 @@ func (t Type) New(pattern string) (Matcher, error) {
if err != nil {
return nil, err
}
return &regexMatcher{
pattern: r,
return &RegexMatcher{
Pattern: pattern,
reg: r,
}, nil
default:
return nil, errors.New("unk type")
@@ -53,11 +54,13 @@ func (t Type) New(pattern string) (Matcher, error) {
type IndexMatcher interface {
// Match returns the index of a matcher that matches the input. It returns empty array if no such matcher exists.
Match(input string) []uint32
// Size returns the number of matchers in the group.
Size() uint32
}
type matcherEntry struct {
m Matcher
id uint32
type MatcherEntry struct {
M Matcher
Id uint32
}
// MatcherGroup is an implementation of IndexMatcher.
@@ -66,7 +69,7 @@ type MatcherGroup struct {
count uint32
fullMatcher FullMatcherGroup
domainMatcher DomainMatcherGroup
otherMatchers []matcherEntry
otherMatchers []MatcherEntry
}
// Add adds a new Matcher into the MatcherGroup, and returns its index. The index will never be 0.
@@ -80,9 +83,9 @@ func (g *MatcherGroup) Add(m Matcher) uint32 {
case domainMatcher:
g.domainMatcher.addMatcher(tm, c)
default:
g.otherMatchers = append(g.otherMatchers, matcherEntry{
m: m,
id: c,
g.otherMatchers = append(g.otherMatchers, MatcherEntry{
M: m,
Id: c,
})
}
@@ -95,8 +98,8 @@ func (g *MatcherGroup) Match(pattern string) []uint32 {
result = append(result, g.fullMatcher.Match(pattern)...)
result = append(result, g.domainMatcher.Match(pattern)...)
for _, e := range g.otherMatchers {
if e.m.Match(pattern) {
result = append(result, e.id)
if e.M.Match(pattern) {
result = append(result, e.Id)
}
}
return result
@@ -106,3 +109,33 @@ func (g *MatcherGroup) Match(pattern string) []uint32 {
func (g *MatcherGroup) Size() uint32 {
return g.count
}
// IndexMatcherGroup presents an ordered list of IndexMatchers as a single
// IndexMatcher. Ids reported by a member are shifted by the combined Size of
// all members placed before it.
type IndexMatcherGroup struct {
	Matchers []IndexMatcher
}

// Match queries the members in order and returns the ids of the first member
// that reports any match for input, each id increased by the summed Size of
// the members that were tried before it. It returns nil when no member
// matches.
func (g *IndexMatcherGroup) Match(input string) []uint32 {
	base := uint32(0)
	for _, member := range g.Matchers {
		hits := member.Match(input)
		if len(hits) == 0 {
			// Only members that did not match contribute to the id offset.
			base += member.Size()
			continue
		}
		if base == 0 {
			// Nothing to shift: hand back the member's own slice.
			return hits
		}
		adjusted := make([]uint32, 0, len(hits))
		for _, id := range hits {
			adjusted = append(adjusted, id+base)
		}
		return adjusted
	}
	return nil
}

// Size returns the sum of Size over all members.
func (g *IndexMatcherGroup) Size() uint32 {
	total := uint32(0)
	for i := range g.Matchers {
		total += g.Matchers[i].Size()
	}
	return total
}