@@ -101,6 +101,7 @@ enum GraphemeState {
101
101
Regional ,
102
102
Emoji ,
103
103
Zwj ,
104
+ Unknown ,
104
105
}
105
106
106
107
impl < ' a > Iterator for Graphemes < ' a > {
@@ -226,6 +227,7 @@ impl<'a> Iterator for Graphemes<'a> {
226
227
break ;
227
228
}
228
229
} ,
230
+ Unknown => unreachable ! ( ) ,
229
231
}
230
232
}
231
233
@@ -388,7 +390,8 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
388
390
take_curr = false ;
389
391
break ;
390
392
}
391
- }
393
+ } ,
394
+ Unknown => unreachable ! ( ) ,
392
395
}
393
396
}
394
397
@@ -433,3 +436,248 @@ pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
433
436
pub fn new_grapheme_indices < ' b > ( s : & ' b str , is_extended : bool ) -> GraphemeIndices < ' b > {
434
437
GraphemeIndices { start_offset : s. as_ptr ( ) as usize , iter : new_graphemes ( s, is_extended) }
435
438
}
439
+
440
+ // maybe unify with PairResult?
441
/// Progress of a `GraphemeCursor` towards deciding whether its offset is a
/// grapheme cluster boundary.
#[derive(PartialEq, Eq)]
enum GraphemeCursorState {
    /// Nothing has been decided yet.
    Unknown,
    /// The cursor offset has been decided not to be a boundary.
    NotBreak,
    /// The cursor offset has been decided to be a boundary.
    Break,
    /// The decision depends on whether the preceding byte is a carriage return.
    CheckCrlf,
    /// The decision depends on the number of preceding regional indicators.
    Regional,
    /// The decision depends on a preceding emoji base (possibly followed by
    /// extending characters).
    Emoji,
}
450
+
451
+ pub struct GraphemeCursor {
452
+ offset : usize , // current cursor position
453
+ len : usize , // total length of the string
454
+ is_extended : bool ,
455
+ state : GraphemeCursorState ,
456
+ cat : Option < GraphemeCat > , // category of codepoint immediately preceding cursor
457
+ catb : Option < GraphemeCat > , // category of codepoint immediately after cursor
458
+ pre_context_offset : Option < usize > ,
459
+ ris_count : Option < usize > ,
460
+ }
461
+
462
/// Reasons a `GraphemeCursor` query could not be answered from the chunk it
/// was given.
///
/// `Debug` is derived so that callers can `unwrap()`/`expect()` results that
/// carry this type and compare them with `assert_eq!`.
#[derive(Debug, PartialEq, Eq)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed: supply the chunk ending at this offset.
    PreContext(usize),
    /// Requesting the chunk previous to the one given.
    PrevChunk,
    /// Requesting the chunk following the one given.
    NextChunk,
    /// Error: the chunk given does not contain the cursor.
    InvalidOffset,
}
469
+
470
/// Outcome of examining only the two codepoint categories on either side of a
/// candidate boundary.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PairResult {
    /// Definitely not a break.
    NotBreak,
    /// Definitely a break.
    Break,
    /// The pair is joined only by the extended-cluster rules (GB9a/GB9b):
    /// not a break in extended mode, a break otherwise.
    Extended,
    /// A break unless it's a CR LF pair.
    CheckCrlf,
    /// A break if preceded by an even number of regional indicators.
    Regional,
    /// Not a break if preceded by an emoji base followed by extending
    /// characters; a break otherwise.
    Emoji,
}
479
+
480
+ fn check_pair ( before : GraphemeCat , after : GraphemeCat ) -> PairResult {
481
+ use self :: PairResult :: * ;
482
+ use tables:: grapheme:: GraphemeCat :: * ;
483
+ match ( before, after) {
484
+ ( GC_Control , GC_Control ) => CheckCrlf , // GB3
485
+ ( GC_Control , _) => Break , // GB4
486
+ ( _, GC_Control ) => Break , // GB5
487
+ ( GC_L , GC_L ) => NotBreak , // GB6
488
+ ( GC_L , GC_V ) => NotBreak , // GB6
489
+ ( GC_L , GC_LV ) => NotBreak , // GB6
490
+ ( GC_L , GC_LVT ) => NotBreak , // GB6
491
+ ( GC_LV , GC_V ) => NotBreak , // GB7
492
+ ( GC_LV , GC_T ) => NotBreak , // GB7
493
+ ( GC_V , GC_V ) => NotBreak , // GB7
494
+ ( GC_V , GC_T ) => NotBreak , // GB7
495
+ ( GC_LVT , GC_T ) => NotBreak , // GB8
496
+ ( GC_T , GC_T ) => NotBreak , // GB8
497
+ ( _, GC_Extend ) => NotBreak , // GB9
498
+ ( _, GC_ZWJ ) => NotBreak , // GB9
499
+ ( _, GC_SpacingMark ) => Extended , // GB9a
500
+ ( GC_Prepend , _) => Extended , // GB9a
501
+ ( GC_Base , GC_E_Modifier ) => NotBreak , // GB10
502
+ ( GC_E_Base_GAZ , GC_E_Modifier ) => NotBreak , // GB10
503
+ ( GC_Extend , GC_E_Modifier ) => Emoji , // GB10
504
+ ( GC_ZWJ , GC_Glue_After_Zwj ) => NotBreak , // GB11
505
+ ( GC_Regional_Indicator , GC_Regional_Indicator ) => Regional , // GB12, GB13
506
+ ( _, _) => Break , // GB999
507
+ }
508
+ }
509
+
510
+ impl GraphemeCursor {
511
+ pub fn new ( offset : usize , len : usize , is_extended : bool ) -> GraphemeCursor {
512
+ use tables:: grapheme as gr;
513
+ let state = if offset == 0 || offset == len {
514
+ GraphemeCursorState :: Break
515
+ } else {
516
+ GraphemeCursorState :: Unknown
517
+ } ;
518
+ GraphemeCursor {
519
+ offset : offset,
520
+ len : len,
521
+ state : state,
522
+ is_extended : is_extended,
523
+ cat : None ,
524
+ catb : None ,
525
+ pre_context_offset : None ,
526
+ ris_count : None ,
527
+ }
528
+ }
529
+
530
+ pub fn provide_context ( & mut self , chunk : & str , chunk_start : usize ) {
531
+ use tables:: grapheme as gr;
532
+ assert ! ( chunk_start + chunk. len( ) == self . pre_context_offset. unwrap( ) ) ;
533
+ self . pre_context_offset = None ;
534
+ if self . is_extended && chunk_start + chunk. len ( ) == self . offset {
535
+ let ch = chunk. chars ( ) . rev ( ) . next ( ) . unwrap ( ) ;
536
+ if gr:: grapheme_category ( ch) == gr:: GC_Prepend {
537
+ self . decide ( false ) ;
538
+ return ;
539
+ }
540
+ }
541
+ match self . state {
542
+ GraphemeCursorState :: CheckCrlf => {
543
+ let is_break = chunk. as_bytes ( ) [ chunk. len ( ) - 1 ] != b'\r' ;
544
+ self . decide ( is_break) ;
545
+ }
546
+ GraphemeCursorState :: Regional => self . handle_regional ( chunk, chunk_start) ,
547
+ GraphemeCursorState :: Emoji => self . handle_emoji ( chunk, chunk_start) ,
548
+ _ => panic ! ( "invalid state" )
549
+ }
550
+ }
551
+
552
+ fn decide ( & mut self , is_break : bool ) {
553
+ self . state = if is_break {
554
+ GraphemeCursorState :: Break
555
+ } else {
556
+ GraphemeCursorState :: NotBreak
557
+ } ;
558
+ }
559
+
560
+ fn decision ( & mut self , is_break : bool ) -> Result < bool , GraphemeIncomplete > {
561
+ self . decide ( is_break) ;
562
+ Ok ( is_break)
563
+ }
564
+
565
+ fn is_boundary_result ( & self ) -> Result < bool , GraphemeIncomplete > {
566
+ if self . state == GraphemeCursorState :: Break {
567
+ Ok ( true )
568
+ } else if self . state == GraphemeCursorState :: NotBreak {
569
+ Ok ( false )
570
+ } else if let Some ( pre_context_offset) = self . pre_context_offset {
571
+ Err ( GraphemeIncomplete :: PreContext ( pre_context_offset) )
572
+ } else {
573
+ unreachable ! ( "inconsistent state" ) ;
574
+ }
575
+ }
576
+
577
+ fn handle_regional ( & mut self , chunk : & str , chunk_start : usize ) {
578
+ use tables:: grapheme as gr;
579
+ let mut ris_count = self . ris_count . unwrap_or ( 0 ) ;
580
+ for ch in chunk. chars ( ) . rev ( ) {
581
+ if gr:: grapheme_category ( ch) != gr:: GC_Regional_Indicator {
582
+ self . ris_count = Some ( ris_count) ;
583
+ self . decide ( ( ris_count & 1 ) == 0 ) ;
584
+ return ;
585
+ }
586
+ ris_count += 1 ;
587
+ }
588
+ self . ris_count = Some ( ris_count) ;
589
+ if chunk_start == 0 {
590
+ self . decide ( ( ris_count & 1 ) == 0 ) ;
591
+ return ;
592
+ }
593
+ self . pre_context_offset = Some ( chunk_start) ;
594
+ }
595
+
596
+ fn handle_emoji ( & mut self , chunk : & str , chunk_start : usize ) {
597
+ use tables:: grapheme as gr;
598
+ for ch in chunk. chars ( ) . rev ( ) {
599
+ match gr:: grapheme_category ( ch) {
600
+ gr:: GC_Extend => ( ) ,
601
+ gr:: GC_E_Base | gr:: GC_E_Base_GAZ => {
602
+ self . decide ( false ) ;
603
+ return ;
604
+ }
605
+ _ => {
606
+ self . decide ( true ) ;
607
+ return ;
608
+ }
609
+ }
610
+ }
611
+ if chunk_start == 0 {
612
+ self . decide ( true ) ;
613
+ return ;
614
+ }
615
+ self . pre_context_offset = Some ( chunk_start) ;
616
+ }
617
+
618
+ pub fn is_boundary ( & mut self , chunk : & str , chunk_start : usize ) -> Result < bool , GraphemeIncomplete > {
619
+ use tables:: grapheme as gr;
620
+ if self . state == GraphemeCursorState :: Break {
621
+ return Ok ( true )
622
+ }
623
+ if self . state == GraphemeCursorState :: NotBreak {
624
+ return Ok ( false )
625
+ }
626
+ if self . offset < chunk_start || self . offset >= chunk_start + chunk. len ( ) {
627
+ return Err ( GraphemeIncomplete :: InvalidOffset )
628
+ }
629
+ if let Some ( pre_context_offset) = self . pre_context_offset {
630
+ return Err ( GraphemeIncomplete :: PreContext ( pre_context_offset) ) ;
631
+ }
632
+ let offset_in_chunk = self . offset - chunk_start;
633
+ if self . catb . is_none ( ) {
634
+ let ch = chunk[ offset_in_chunk..] . chars ( ) . next ( ) . unwrap ( ) ;
635
+ self . catb = Some ( gr:: grapheme_category ( ch) ) ;
636
+ }
637
+ if self . offset == chunk_start {
638
+ match self . catb . unwrap ( ) {
639
+ gr:: GC_Control => {
640
+ if chunk. as_bytes ( ) [ offset_in_chunk] == b'\n' {
641
+ self . state = GraphemeCursorState :: CheckCrlf ;
642
+ }
643
+ }
644
+ gr:: GC_Regional_Indicator => self . state = GraphemeCursorState :: Regional ,
645
+ gr:: GC_E_Modifier => self . state = GraphemeCursorState :: Emoji ,
646
+ _ => ( )
647
+ }
648
+ self . pre_context_offset = Some ( chunk_start) ;
649
+ return Err ( GraphemeIncomplete :: PreContext ( chunk_start) ) ;
650
+ }
651
+ if self . cat . is_none ( ) {
652
+ let ch = chunk[ ..offset_in_chunk] . chars ( ) . rev ( ) . next ( ) . unwrap ( ) ;
653
+ self . cat = Some ( gr:: grapheme_category ( ch) ) ;
654
+ }
655
+ match check_pair ( self . cat . unwrap ( ) , self . catb . unwrap ( ) ) {
656
+ PairResult :: NotBreak => return self . decision ( false ) ,
657
+ PairResult :: Break => return self . decision ( true ) ,
658
+ PairResult :: Extended => {
659
+ let is_extended = self . is_extended ;
660
+ return self . decision ( is_extended) ;
661
+ }
662
+ PairResult :: CheckCrlf => {
663
+ if chunk. as_bytes ( ) [ offset_in_chunk] != b'\n' {
664
+ return self . decision ( true ) ;
665
+ }
666
+ if self . offset > chunk_start {
667
+ return self . decision ( chunk. as_bytes ( ) [ offset_in_chunk - 1 ] != b'\r' ) ;
668
+ }
669
+ self . state = GraphemeCursorState :: CheckCrlf ;
670
+ return Err ( GraphemeIncomplete :: PreContext ( chunk_start) ) ;
671
+ }
672
+ PairResult :: Regional => {
673
+ self . handle_regional ( & chunk[ ..offset_in_chunk] , chunk_start) ;
674
+ self . is_boundary_result ( )
675
+ }
676
+ PairResult :: Emoji => {
677
+ self . handle_emoji ( & chunk[ ..offset_in_chunk] , chunk_start) ;
678
+ self . is_boundary_result ( )
679
+ }
680
+ }
681
+ }
682
+
683
+ }
0 commit comments