MPH domain matcher: Support building & using cache directly (instead of building from geosite.dat when Xray starts) (#5505)

Like https://github.com/XTLS/Xray-core/pull/5488#issuecomment-3710995080
2026-07-05 19:28:45 +00:00 · 2026-01-31 16:45:39 +03:30
parent afcfdbca70
commit 61e1153157
18 changed files with 988 additions and 161 deletions
@@ -24,6 +24,8 @@ const (
 	XUDPBaseKey          = "xray.xudp.basekey"

 	TunFdKey = "xray.tun.fd"
+
+	MphCachePath = "xray.mph.cache"
 )

 type EnvFlag struct {
@@ -7,8 +7,8 @@ import (
 const validCharCount = 53

+// MatchType is the match record the ACAutomaton keeps for one trie node: Type
+// is the pattern type stored for the node and Exist reports whether a pattern
+// ends there. The fields are exported so that encoding/gob, which only encodes
+// exported fields, can serialize the automaton.
 type MatchType struct {
-	matchType Type
-	exist     bool
+	Type  Type
+	Exist bool
 }

 const (
@@ -17,23 +17,23 @@ const (
 )

+// Edge is one outgoing transition of an automaton node. Type is TrieEdge for
+// a transition created by Add and FailEdge otherwise; NextNode is the index of
+// the target node, where 0 is the root.
 type Edge struct {
-	edgeType bool
-	nextNode int
+	Type     bool
+	NextNode int
 }

+// ACAutomaton is an Aho-Corasick style automaton over an alphabet of
+// validCharCount symbols. Trie, Fail and Exists are parallel slices indexed by
+// node: Trie holds the transition table, Fail the fallback node assigned by
+// Build, and Exists the match record. Count is the index of the most recently
+// created node (the root is node 0).
 type ACAutomaton struct {
-	trie   [][validCharCount]Edge
-	fail   []int
-	exists []MatchType
-	count  int
+	Trie   [][validCharCount]Edge
+	Fail   []int
+	Exists []MatchType
+	Count  int
 }

 func newNode() [validCharCount]Edge {
 	var s [validCharCount]Edge
 	for i := range s {
 		s[i] = Edge{
-			edgeType: FailEdge,
-			nextNode: 0,
+			Type:     FailEdge,
+			NextNode: 0,
 		}
 	}
 	return s
@@ -123,11 +123,11 @@ var char2Index = []int{

+// NewACAutomaton returns an automaton that contains only the root node
+// (index 0): every edge of the root is a fail edge pointing back to node 0,
+// its fail link is 0 and it is marked as not matching any pattern.
 func NewACAutomaton() *ACAutomaton {
 	ac := new(ACAutomaton)
-	ac.trie = append(ac.trie, newNode())
-	ac.fail = append(ac.fail, 0)
-	ac.exists = append(ac.exists, MatchType{
-		matchType: Full,
-		exist:     false,
+	ac.Trie = append(ac.Trie, newNode())
+	ac.Fail = append(ac.Fail, 0)
+	ac.Exists = append(ac.Exists, MatchType{
+		Type:  Full,
+		Exist: false,
 	})
 	return ac
 }
@@ -136,53 +136,53 @@ func (ac *ACAutomaton) Add(domain string, t Type) {
 	node := 0
 	for i := len(domain) - 1; i >= 0; i-- {
 		idx := char2Index[domain[i]]
-		if ac.trie[node][idx].nextNode == 0 {
-			ac.count++
-			if len(ac.trie) < ac.count+1 {
-				ac.trie = append(ac.trie, newNode())
-				ac.fail = append(ac.fail, 0)
-				ac.exists = append(ac.exists, MatchType{
-					matchType: Full,
-					exist:     false,
+		if ac.Trie[node][idx].NextNode == 0 {
+			ac.Count++
+			if len(ac.Trie) < ac.Count+1 {
+				ac.Trie = append(ac.Trie, newNode())
+				ac.Fail = append(ac.Fail, 0)
+				ac.Exists = append(ac.Exists, MatchType{
+					Type:  Full,
+					Exist: false,
 				})
 			}
-			ac.trie[node][idx] = Edge{
-				edgeType: TrieEdge,
-				nextNode: ac.count,
+			ac.Trie[node][idx] = Edge{
+				Type:     TrieEdge,
+				NextNode: ac.Count,
 			}
 		}
-		node = ac.trie[node][idx].nextNode
+		node = ac.Trie[node][idx].NextNode
 	}
-	ac.exists[node] = MatchType{
-		matchType: t,
-		exist:     true,
+	ac.Exists[node] = MatchType{
+		Type:  t,
+		Exist: true,
 	}
 	switch t {
 	case Domain:
-		ac.exists[node] = MatchType{
-			matchType: Full,
-			exist:     true,
+		ac.Exists[node] = MatchType{
+			Type:  Full,
+			Exist: true,
 		}
 		idx := char2Index['.']
-		if ac.trie[node][idx].nextNode == 0 {
-			ac.count++
-			if len(ac.trie) < ac.count+1 {
-				ac.trie = append(ac.trie, newNode())
-				ac.fail = append(ac.fail, 0)
-				ac.exists = append(ac.exists, MatchType{
-					matchType: Full,
-					exist:     false,
+		if ac.Trie[node][idx].NextNode == 0 {
+			ac.Count++
+			if len(ac.Trie) < ac.Count+1 {
+				ac.Trie = append(ac.Trie, newNode())
+				ac.Fail = append(ac.Fail, 0)
+				ac.Exists = append(ac.Exists, MatchType{
+					Type:  Full,
+					Exist: false,
 				})
 			}
-			ac.trie[node][idx] = Edge{
-				edgeType: TrieEdge,
-				nextNode: ac.count,
+			ac.Trie[node][idx] = Edge{
+				Type:     TrieEdge,
+				NextNode: ac.Count,
 			}
 		}
-		node = ac.trie[node][idx].nextNode
-		ac.exists[node] = MatchType{
-			matchType: t,
-			exist:     true,
+		node = ac.Trie[node][idx].NextNode
+		ac.Exists[node] = MatchType{
+			Type:  t,
+			Exist: true,
 		}
 	default:
 		break
@@ -192,8 +192,8 @@ func (ac *ACAutomaton) Add(domain string, t Type) {
 func (ac *ACAutomaton) Build() {
 	queue := list.New()
 	for i := 0; i < validCharCount; i++ {
-		if ac.trie[0][i].nextNode != 0 {
-			queue.PushBack(ac.trie[0][i])
+		if ac.Trie[0][i].NextNode != 0 {
+			queue.PushBack(ac.Trie[0][i])
 		}
 	}
 	for {
@@ -201,16 +201,16 @@ func (ac *ACAutomaton) Build() {
 		if front == nil {
 			break
 		} else {
-			node := front.Value.(Edge).nextNode
+			node := front.Value.(Edge).NextNode
 			queue.Remove(front)
 			for i := 0; i < validCharCount; i++ {
-				if ac.trie[node][i].nextNode != 0 {
-					ac.fail[ac.trie[node][i].nextNode] = ac.trie[ac.fail[node]][i].nextNode
-					queue.PushBack(ac.trie[node][i])
+				if ac.Trie[node][i].NextNode != 0 {
+					ac.Fail[ac.Trie[node][i].NextNode] = ac.Trie[ac.Fail[node]][i].NextNode
+					queue.PushBack(ac.Trie[node][i])
 				} else {
-					ac.trie[node][i] = Edge{
-						edgeType: FailEdge,
-						nextNode: ac.trie[ac.fail[node]][i].nextNode,
+					ac.Trie[node][i] = Edge{
+						Type:     FailEdge,
+						NextNode: ac.Trie[ac.Fail[node]][i].NextNode,
 					}
 				}
 			}
@@ -230,9 +230,9 @@ func (ac *ACAutomaton) Match(s string) bool {
 			return false
 		}
 		idx := char2Index[chr]
-		fullMatch = fullMatch && ac.trie[node][idx].edgeType
-		node = ac.trie[node][idx].nextNode
-		switch ac.exists[node].matchType {
+		fullMatch = fullMatch && ac.Trie[node][idx].Type
+		node = ac.Trie[node][idx].NextNode
+		switch ac.Exists[node].Type {
 		case Substr:
 			return true
 		case Domain:
@@ -243,5 +243,5 @@ func (ac *ACAutomaton) Match(s string) bool {
 			break
 		}
 	}
-	return fullMatch && ac.exists[node].exist
+	return fullMatch && ac.Exists[node].Exist
 }
@@ -39,14 +39,18 @@ func (m domainMatcher) String() string {
 	return "domain:" + string(m)
 }

-type regexMatcher struct {
-	pattern *regexp.Regexp
+// RegexMatcher matches strings against a regular expression. Pattern is the
+// expression source and, being the only exported field, the only one that
+// encoding/gob serializes; reg is the compiled form, which Match rebuilds from
+// Pattern when it is nil.
+type RegexMatcher struct {
+	Pattern string
+	reg     *regexp.Regexp
 }

-func (m *regexMatcher) Match(s string) bool {
-	return m.pattern.MatchString(s)
+// Match reports whether s matches Pattern, compiling Pattern on first use when
+// reg is nil.
+//
+// NOTE(review): regexp.MustCompile panics on an invalid Pattern, and the lazy
+// write to m.reg is not synchronized. Confirm that Pattern is always valid
+// here and that Match is never called concurrently on a matcher whose reg is
+// still nil.
+func (m *RegexMatcher) Match(s string) bool {
+	if m.reg == nil {
+		m.reg = regexp.MustCompile(m.Pattern)
+	}
+	return m.reg.MatchString(s)
 }

-func (m *regexMatcher) String() string {
-	return "regexp:" + m.pattern.String()
+// String returns Pattern prefixed with "regexp:".
+func (m *RegexMatcher) String() string {
+	return "regexp:" + m.Pattern
 }
@@ -25,40 +25,40 @@ func RollingHash(s string) uint32 {
 // 2. `substr` patterns are matched by ac automaton;
 // 3. `regex` patterns are matched with the regex library.
+//
+// The fields are exported so that Serialize can gob-encode a built group.
+// Level0 holds the per-bucket seeds and Level1 maps a seeded hash slot to an
+// index into Rules; Build sets Level0Mask and Level1Mask to the length of the
+// respective table minus one. RuleMap is only used while patterns are being
+// added: Build consumes it and sets it to nil, and Serialize does not write it.
 type MphMatcherGroup struct {
-	ac            *ACAutomaton
-	otherMatchers []matcherEntry
-	rules         []string
-	level0        []uint32
-	level0Mask    int
-	level1        []uint32
-	level1Mask    int
-	count         uint32
-	ruleMap       *map[string]uint32
+	Ac            *ACAutomaton
+	OtherMatchers []MatcherEntry
+	Rules         []string
+	Level0        []uint32
+	Level0Mask    int
+	Level1        []uint32
+	Level1Mask    int
+	Count         uint32
+	RuleMap       *map[string]uint32
 }

+// AddFullOrDomainPattern records pattern in RuleMap under its rolling hash h.
+// For a Domain pattern it additionally records "."+pattern under the hash
+// h*PrimeRK + '.', then falls through to the Full case. Any other type is
+// ignored.
+//
+// NOTE(review): RuleMap is dereferenced unconditionally and Build sets it to
+// nil, so this presumably must not be called after Build — confirm.
 func (g *MphMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) {
 	h := RollingHash(pattern)
 	switch t {
 	case Domain:
-		(*g.ruleMap)["."+pattern] = h*PrimeRK + uint32('.')
+		(*g.RuleMap)["."+pattern] = h*PrimeRK + uint32('.')
 		fallthrough
 	case Full:
-		(*g.ruleMap)[pattern] = h
+		(*g.RuleMap)[pattern] = h
 	default:
 	}
 }

+// NewMphMatcherGroup returns an empty group: Count starts at 1, RuleMap is an
+// empty map ready to receive patterns, and every other field is left at its
+// zero value until patterns are added and Build is called.
 func NewMphMatcherGroup() *MphMatcherGroup {
 	return &MphMatcherGroup{
-		ac:            nil,
-		otherMatchers: nil,
-		rules:         nil,
-		level0:        nil,
-		level0Mask:    0,
-		level1:        nil,
-		level1Mask:    0,
-		count:         1,
-		ruleMap:       &map[string]uint32{},
+		Ac:            nil,
+		OtherMatchers: nil,
+		Rules:         nil,
+		Level0:        nil,
+		Level0Mask:    0,
+		Level1:        nil,
+		Level1Mask:    0,
+		Count:         1,
+		RuleMap:       &map[string]uint32{},
 	}
 }

@@ -66,10 +66,10 @@ func NewMphMatcherGroup() *MphMatcherGroup {
 func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
 	switch t {
 	case Substr:
-		if g.ac == nil {
-			g.ac = NewACAutomaton()
+		if g.Ac == nil {
+			g.Ac = NewACAutomaton()
 		}
-		g.ac.Add(pattern, t)
+		g.Ac.Add(pattern, t)
 	case Full, Domain:
 		pattern = strings.ToLower(pattern)
 		g.AddFullOrDomainPattern(pattern, t)
@@ -78,39 +78,39 @@ func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
 		if err != nil {
 			return 0, err
 		}
-		g.otherMatchers = append(g.otherMatchers, matcherEntry{
-			m:  &regexMatcher{pattern: r},
-			id: g.count,
+		g.OtherMatchers = append(g.OtherMatchers, MatcherEntry{
+			M:  &RegexMatcher{Pattern: pattern, reg: r},
+			Id: g.Count,
 		})
 	default:
 		panic("Unknown type")
 	}
-	return g.count, nil
+	return g.Count, nil
 }

 // Build builds a minimal perfect hash table and ac automaton from insert rules
 func (g *MphMatcherGroup) Build() {
-	if g.ac != nil {
-		g.ac.Build()
+	if g.Ac != nil {
+		g.Ac.Build()
 	}
-	keyLen := len(*g.ruleMap)
+	keyLen := len(*g.RuleMap)
 	if keyLen == 0 {
 		keyLen = 1
-		(*g.ruleMap)["empty___"] = RollingHash("empty___")
+		(*g.RuleMap)["empty___"] = RollingHash("empty___")
 	}
-	g.level0 = make([]uint32, nextPow2(keyLen/4))
-	g.level0Mask = len(g.level0) - 1
-	g.level1 = make([]uint32, nextPow2(keyLen))
-	g.level1Mask = len(g.level1) - 1
-	sparseBuckets := make([][]int, len(g.level0))
+	g.Level0 = make([]uint32, nextPow2(keyLen/4))
+	g.Level0Mask = len(g.Level0) - 1
+	g.Level1 = make([]uint32, nextPow2(keyLen))
+	g.Level1Mask = len(g.Level1) - 1
+	sparseBuckets := make([][]int, len(g.Level0))
 	var ruleIdx int
-	for rule, hash := range *g.ruleMap {
-		n := int(hash) & g.level0Mask
-		g.rules = append(g.rules, rule)
+	for rule, hash := range *g.RuleMap {
+		n := int(hash) & g.Level0Mask
+		g.Rules = append(g.Rules, rule)
 		sparseBuckets[n] = append(sparseBuckets[n], ruleIdx)
 		ruleIdx++
 	}
-	g.ruleMap = nil
+	g.RuleMap = nil
 	var buckets []indexBucket
 	for n, vals := range sparseBuckets {
 		if len(vals) > 0 {
@@ -119,7 +119,7 @@ func (g *MphMatcherGroup) Build() {
 	}
 	sort.Sort(bySize(buckets))

-	occ := make([]bool, len(g.level1))
+	occ := make([]bool, len(g.Level1))
 	var tmpOcc []int
 	for _, bucket := range buckets {
 		seed := uint32(0)
@@ -127,7 +127,7 @@ func (g *MphMatcherGroup) Build() {
 			findSeed := true
 			tmpOcc = tmpOcc[:0]
 			for _, i := range bucket.vals {
-				n := int(strhashFallback(unsafe.Pointer(&g.rules[i]), uintptr(seed))) & g.level1Mask
+				n := int(strhashFallback(unsafe.Pointer(&g.Rules[i]), uintptr(seed))) & g.Level1Mask
 				if occ[n] {
 					for _, n := range tmpOcc {
 						occ[n] = false
@@ -138,10 +138,10 @@ func (g *MphMatcherGroup) Build() {
 				}
 				occ[n] = true
 				tmpOcc = append(tmpOcc, n)
-				g.level1[n] = uint32(i)
+				g.Level1[n] = uint32(i)
 			}
 			if findSeed {
-				g.level0[bucket.n] = seed
+				g.Level0[bucket.n] = seed
 				break
 			}
 		}
@@ -159,11 +159,11 @@ func nextPow2(v int) int {

-// Lookup searches for s in t and returns its index and whether it was found.
+// Lookup reports whether s is one of the rules stored in the hash table.
+// h selects the level-0 bucket (it is expected to be the RollingHash of s, the
+// value Build uses to pick the bucket); that bucket's seed rehashes s into a
+// level-1 slot, and the rule stored at that slot is compared with s.
 func (g *MphMatcherGroup) Lookup(h uint32, s string) bool {
-	i0 := int(h) & g.level0Mask
-	seed := g.level0[i0]
-	i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.level1Mask
-	n := g.level1[i1]
-	return s == g.rules[int(n)]
+	i0 := int(h) & g.Level0Mask
+	seed := g.Level0[i0]
+	i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.Level1Mask
+	n := g.Level1[i1]
+	return s == g.Rules[int(n)]
 }

 // Match implements IndexMatcher.Match.
@@ -183,13 +183,13 @@ func (g *MphMatcherGroup) Match(pattern string) []uint32 {
 		result = append(result, 1)
 		return result
 	}
-	if g.ac != nil && g.ac.Match(pattern) {
+	if g.Ac != nil && g.Ac.Match(pattern) {
 		result = append(result, 1)
 		return result
 	}
-	for _, e := range g.otherMatchers {
-		if e.m.Match(pattern) {
-			result = append(result, e.id)
+	for _, e := range g.OtherMatchers {
+		if e.M.Match(pattern) {
+			result = append(result, e.Id)
 			return result
 		}
 	}
@@ -302,3 +302,7 @@ func readUnaligned64(p unsafe.Pointer) uint64 {
 	q := (*[8]byte)(p)
 	return uint64(q[0]) | uint64(q[1])<<8 | uint64(q[2])<<16 | uint64(q[3])<<24 | uint64(q[4])<<32 | uint64(q[5])<<40 | uint64(q[6])<<48 | uint64(q[7])<<56
 }
+
+// Size implements IndexMatcher.Size by returning Count.
+func (g *MphMatcherGroup) Size() uint32 {
+	return g.Count
+}
@@ -0,0 +1,47 @@
+package strmatcher
+
+import (
+	"bytes"
+	"encoding/gob"
+	"io"
+)
+
+// init registers the concrete matcher types with encoding/gob so that values
+// stored in the Matcher interface field of MatcherEntry can be encoded and
+// decoded.
+func init() {
+	gob.Register(&RegexMatcher{})
+	gob.Register(fullMatcher(""))
+	gob.Register(substrMatcher(""))
+	gob.Register(domainMatcher(""))
+}
+
+// Serialize gob-encodes the lookup state of g (automaton, other matchers,
+// rules, both hash levels with their masks, and Count) to w and returns the
+// encoder's error. RuleMap is deliberately left out of the encoded copy.
+func (g *MphMatcherGroup) Serialize(w io.Writer) error {
+	data := MphMatcherGroup{
+		Ac:            g.Ac,
+		OtherMatchers: g.OtherMatchers,
+		Rules:         g.Rules,
+		Level0:        g.Level0,
+		Level0Mask:    g.Level0Mask,
+		Level1:        g.Level1,
+		Level1Mask:    g.Level1Mask,
+		Count:         g.Count,
+	}
+	return gob.NewEncoder(w).Encode(data)
+}
+
+// NewMphMatcherGroupFromBuffer gob-decodes data into a MphMatcherGroup and
+// copies the decoded lookup state into a fresh group, presumably so it can be
+// used without calling Build. It returns the decoder's error if data cannot be
+// decoded. RuleMap in the result is the empty map created by
+// NewMphMatcherGroup.
+//
+// NOTE(review): the decoded tables are used as-is; nothing checks that the
+// masks and the Level1 entries are consistent with the slice lengths, so a
+// corrupt or empty cache may make Lookup index out of range — confirm that the
+// cache is trusted or validate it here.
+func NewMphMatcherGroupFromBuffer(data []byte) (*MphMatcherGroup, error) {
+	var gData MphMatcherGroup
+	if err := gob.NewDecoder(bytes.NewReader(data)).Decode(&gData); err != nil {
+		return nil, err
+	}
+
+	g := NewMphMatcherGroup()
+	g.Ac = gData.Ac
+	g.OtherMatchers = gData.OtherMatchers
+	g.Rules = gData.Rules
+	g.Level0 = gData.Level0
+	g.Level0Mask = gData.Level0Mask
+	g.Level1 = gData.Level1
+	g.Level1Mask = gData.Level1Mask
+	g.Count = gData.Count
+
+	return g, nil
+}
@@ -41,8 +41,9 @@ func (t Type) New(pattern string) (Matcher, error) {
 		if err != nil {
 			return nil, err
 		}
-		return &regexMatcher{
-			pattern: r,
+		return &RegexMatcher{
+			Pattern: pattern,
+			reg:     r,
 		}, nil
 	default:
 		return nil, errors.New("unk type")
@@ -53,11 +54,13 @@ func (t Type) New(pattern string) (Matcher, error) {
 type IndexMatcher interface {
 	// Match returns the index of a matcher that matches the input. It returns empty array if no such matcher exists.
 	Match(input string) []uint32
+	// Size returns the number of matchers in the group.
+	Size() uint32
 }

-type matcherEntry struct {
-	m  Matcher
-	id uint32
+// MatcherEntry pairs a Matcher with the id that is reported when it matches.
+// The type and its fields are exported so that encoding/gob can serialize
+// MphMatcherGroup.OtherMatchers.
+type MatcherEntry struct {
+	M  Matcher
+	Id uint32
 }

 // MatcherGroup is an implementation of IndexMatcher.
@@ -66,7 +69,7 @@ type MatcherGroup struct {
 	count         uint32
 	fullMatcher   FullMatcherGroup
 	domainMatcher DomainMatcherGroup
-	otherMatchers []matcherEntry
+	otherMatchers []MatcherEntry
 }

 // Add adds a new Matcher into the MatcherGroup, and returns its index. The index will never be 0.
@@ -80,9 +83,9 @@ func (g *MatcherGroup) Add(m Matcher) uint32 {
 	case domainMatcher:
 		g.domainMatcher.addMatcher(tm, c)
 	default:
-		g.otherMatchers = append(g.otherMatchers, matcherEntry{
-			m:  m,
-			id: c,
+		g.otherMatchers = append(g.otherMatchers, MatcherEntry{
+			M:  m,
+			Id: c,
 		})
 	}

@@ -95,8 +98,8 @@ func (g *MatcherGroup) Match(pattern string) []uint32 {
 	result = append(result, g.fullMatcher.Match(pattern)...)
 	result = append(result, g.domainMatcher.Match(pattern)...)
 	for _, e := range g.otherMatchers {
-		if e.m.Match(pattern) {
-			result = append(result, e.id)
+		if e.M.Match(pattern) {
+			result = append(result, e.Id)
 		}
 	}
 	return result
@@ -106,3 +109,33 @@ func (g *MatcherGroup) Match(pattern string) []uint32 {
 func (g *MatcherGroup) Size() uint32 {
 	return g.count
 }
+
+// IndexMatcherGroup chains several IndexMatchers and exposes them through the
+// same Match/Size methods.
+type IndexMatcherGroup struct {
+	Matchers []IndexMatcher
+}
+
+// Match tries the matchers in order and returns the result of the first one
+// that reports any match; later matchers are not consulted. Ids coming from a
+// matcher are shifted by the combined Size of all matchers before it. It
+// returns nil when no matcher matches.
+func (g *IndexMatcherGroup) Match(input string) []uint32 {
+	var offset uint32
+	for _, m := range g.Matchers {
+		if res := m.Match(input); len(res) > 0 {
+			// Nothing to shift yet, so the result is returned without copying.
+			if offset == 0 {
+				return res
+			}
+			shifted := make([]uint32, len(res))
+			for i, id := range res {
+				shifted[i] = id + offset
+			}
+			return shifted
+		}
+		offset += m.Size()
+	}
+	return nil
+}
+
+// Size returns the sum of the sizes of all chained matchers.
+func (g *IndexMatcherGroup) Size() uint32 {
+	var count uint32
+	for _, m := range g.Matchers {
+		count += m.Size()
+	}
+	return count
+}