@@ -583,7 +583,38 @@ impl Extractor {
583
583
fn union ( & self , mut seq1 : Seq , seq2 : & mut Seq ) -> Seq {
584
584
if seq1. max_union_len ( seq2) . map_or ( false , |len| len > self . limit_total )
585
585
{
586
- seq2. make_infinite ( ) ;
586
+ // We try to trim our literal sequences to see if we can make
587
+ // room for more literals. The idea is that we'd rather trim down
588
+ // literals already in our sequence if it means we can add a few
589
+ // more and retain a finite sequence. Otherwise, we'll union with
590
+ // an infinite sequence and that infects everything and effectively
591
+ // stops literal extraction in its tracks.
592
+ //
593
+ // We do we keep 4 bytes here? Well, it's a bit of an abstraction
594
+ // leakage. Downstream, the literals may wind up getting fed to
595
+ // the Teddy algorithm, which supports searching literals up to
596
+ // length 4. So that's why we pick that number here. Arguably this
597
+ // should be a tuneable parameter, but it seems a little tricky to
598
+ // describe. And I'm still unsure if this is the right way to go
599
+ // about culling literal sequences.
600
+ match self . kind {
601
+ ExtractKind :: Prefix => {
602
+ seq1. keep_first_bytes ( 4 ) ;
603
+ seq2. keep_first_bytes ( 4 ) ;
604
+ }
605
+ ExtractKind :: Suffix => {
606
+ seq1. keep_last_bytes ( 4 ) ;
607
+ seq2. keep_last_bytes ( 4 ) ;
608
+ }
609
+ }
610
+ seq1. dedup ( ) ;
611
+ seq2. dedup ( ) ;
612
+ if seq1
613
+ . max_union_len ( seq2)
614
+ . map_or ( false , |len| len > self . limit_total )
615
+ {
616
+ seq2. make_infinite ( ) ;
617
+ }
587
618
}
588
619
seq1. union ( seq2) ;
589
620
assert ! ( seq1. len( ) . map_or( true , |x| x <= self . limit_total) ) ;
@@ -2763,8 +2794,27 @@ mod tests {
2763
2794
assert_eq ! ( expected, ( prefixes, suffixes) ) ;
2764
2795
}
2765
2796
2797
+ // This tests that we get some kind of literals extracted for a beefier
2798
+ // alternation with case insensitive mode enabled. At one point during
2799
+ // development, this returned nothing, and motivated some special case
2800
+ // code in Extractor::union to try and trim down the literal sequences
2801
+ // if the union would blow the limits set.
2802
+ #[ test]
2803
+ #[ cfg( feature = "unicode-case" ) ]
2804
+ fn holmes_alt ( ) {
2805
+ let mut pre =
2806
+ prefixes ( r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker" ) ;
2807
+ assert ! ( pre. len( ) . unwrap( ) > 0 ) ;
2808
+ pre. optimize_for_prefix_by_preference ( ) ;
2809
+ assert ! ( pre. len( ) . unwrap( ) > 0 ) ;
2810
+ }
2811
+
2766
2812
// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
2767
2813
// See: CVE-2022-24713
2814
+ //
2815
+ // We test this here to ensure literal extraction completes in reasonable
2816
+ // time and isn't materially impacted by these sorts of pathological
2817
+ // repeats.
2768
2818
#[ test]
2769
2819
fn crazy_repeats ( ) {
2770
2820
assert_eq ! ( inexact( [ I ( "" ) ] , [ I ( "" ) ] ) , e( r"(?:){4294967295}" ) ) ;
@@ -3079,7 +3129,7 @@ mod tests {
3079
3129
// literal optimizations.
3080
3130
let ( prefixes, suffixes) = e ( pat) ;
3081
3131
assert ! ( !suffixes. is_finite( ) ) ;
3082
- assert_eq ! ( Some ( 247 ) , prefixes. len( ) ) ;
3132
+ assert_eq ! ( Some ( 243 ) , prefixes. len( ) ) ;
3083
3133
}
3084
3134
3085
3135
#[ test]
0 commit comments