Skip to content

Commit f0df6be

Browse files
committed
Starting to implement rope-capable API
Very much work in progress. See unicode-rs#21
1 parent e86a69b commit f0df6be

File tree

1 file changed

+249
-1
lines changed

1 file changed

+249
-1
lines changed

src/grapheme.rs

Lines changed: 249 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ enum GraphemeState {
101101
Regional,
102102
Emoji,
103103
Zwj,
104+
Unknown,
104105
}
105106

106107
impl<'a> Iterator for Graphemes<'a> {
@@ -226,6 +227,7 @@ impl<'a> Iterator for Graphemes<'a> {
226227
break;
227228
}
228229
},
230+
Unknown => unreachable!(),
229231
}
230232
}
231233

@@ -388,7 +390,8 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
388390
take_curr = false;
389391
break;
390392
}
391-
}
393+
},
394+
Unknown => unreachable!(),
392395
}
393396
}
394397

@@ -433,3 +436,248 @@ pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
433436
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
434437
GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
435438
}
439+
440+
// maybe unify with PairResult?
441+
/// How far a `GraphemeCursor` has got in deciding whether its offset is a
/// grapheme cluster boundary.
#[derive(Eq, PartialEq)]
enum GraphemeCursorState {
    /// Nothing has been decided yet (initial state for a cursor that is
    /// strictly inside the string).
    Unknown,
    /// Decided: the cursor is not on a boundary.
    NotBreak,
    /// Decided: the cursor is on a boundary.
    Break,
    /// The code point after the cursor is `\n`; the answer depends on whether
    /// the byte just before the cursor is `\r`.
    CheckCrlf,
    /// The answer depends on how many regional indicators precede the cursor.
    Regional,
    /// The answer depends on whether an emoji base (optionally followed by
    /// extending characters) precedes the cursor.
    Emoji,
}
450+
451+
pub struct GraphemeCursor {
452+
offset: usize, // current cursor position
453+
len: usize, // total length of the string
454+
is_extended: bool,
455+
state: GraphemeCursorState,
456+
cat: Option<GraphemeCat>, // category of codepoint immediately preceding cursor
457+
catb: Option<GraphemeCat>, // category of codepoint immediately after cursor
458+
pre_context_offset: Option<usize>,
459+
ris_count: Option<usize>,
460+
}
461+
462+
/// Reasons a `GraphemeCursor` query could not be answered from the chunk it
/// was given.
///
/// `Debug` is derived so that callers can `unwrap`/`expect` a
/// `Result<_, GraphemeIncomplete>` and use it in `assert_eq!`.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More preceding text is needed: call `provide_context` with a chunk
    /// that ends at the given byte offset.
    PreContext(usize),
    /// Requesting the chunk previous to the one given.
    PrevChunk,
    /// Requesting the chunk following the one given.
    NextChunk,
    /// Error: the cursor's offset does not lie inside the chunk given.
    InvalidOffset,
}
469+
470+
/// What can be concluded about a potential break from the categories of the
/// two code points on either side of it.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
enum PairResult {
    /// Definitely not a break.
    NotBreak,
    /// Definitely a break.
    Break,
    /// Not a break in extended mode, a break in legacy mode (UAX #29 rules
    /// GB9a and GB9b only apply to extended grapheme clusters).
    Extended,
    /// A break unless it's a CR LF pair.
    CheckCrlf,
    /// A break if preceded by an even number of regional indicators.
    Regional,
    /// Not a break if preceded by an emoji base followed only by extending
    /// characters; a break otherwise.
    Emoji,
}
479+
480+
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
481+
use self::PairResult::*;
482+
use tables::grapheme::GraphemeCat::*;
483+
match (before, after) {
484+
(GC_Control, GC_Control) => CheckCrlf, // GB3
485+
(GC_Control, _) => Break, // GB4
486+
(_, GC_Control) => Break, // GB5
487+
(GC_L, GC_L) => NotBreak, // GB6
488+
(GC_L, GC_V) => NotBreak, // GB6
489+
(GC_L, GC_LV) => NotBreak, // GB6
490+
(GC_L, GC_LVT) => NotBreak, // GB6
491+
(GC_LV, GC_V) => NotBreak, // GB7
492+
(GC_LV, GC_T) => NotBreak, // GB7
493+
(GC_V, GC_V) => NotBreak, // GB7
494+
(GC_V, GC_T) => NotBreak, // GB7
495+
(GC_LVT, GC_T) => NotBreak, // GB8
496+
(GC_T, GC_T) => NotBreak, // GB8
497+
(_, GC_Extend) => NotBreak, // GB9
498+
(_, GC_ZWJ) => NotBreak, // GB9
499+
(_, GC_SpacingMark) => Extended, // GB9a
500+
(GC_Prepend, _) => Extended, // GB9a
501+
(GC_Base, GC_E_Modifier) => NotBreak, // GB10
502+
(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
503+
(GC_Extend, GC_E_Modifier) => Emoji, // GB10
504+
(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
505+
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
506+
(_, _) => Break, // GB999
507+
}
508+
}
509+
510+
impl GraphemeCursor {
511+
pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
512+
use tables::grapheme as gr;
513+
let state = if offset == 0 || offset == len {
514+
GraphemeCursorState::Break
515+
} else {
516+
GraphemeCursorState::Unknown
517+
};
518+
GraphemeCursor {
519+
offset: offset,
520+
len: len,
521+
state: state,
522+
is_extended: is_extended,
523+
cat: None,
524+
catb: None,
525+
pre_context_offset: None,
526+
ris_count: None,
527+
}
528+
}
529+
530+
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
531+
use tables::grapheme as gr;
532+
assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
533+
self.pre_context_offset = None;
534+
if self.is_extended && chunk_start + chunk.len() == self.offset {
535+
let ch = chunk.chars().rev().next().unwrap();
536+
if gr::grapheme_category(ch) == gr::GC_Prepend {
537+
self.decide(false);
538+
return;
539+
}
540+
}
541+
match self.state {
542+
GraphemeCursorState::CheckCrlf => {
543+
let is_break = chunk.as_bytes()[chunk.len() - 1] != b'\r';
544+
self.decide(is_break);
545+
}
546+
GraphemeCursorState::Regional => self.handle_regional(chunk, chunk_start),
547+
GraphemeCursorState::Emoji => self.handle_emoji(chunk, chunk_start),
548+
_ => panic!("invalid state")
549+
}
550+
}
551+
552+
fn decide(&mut self, is_break: bool) {
553+
self.state = if is_break {
554+
GraphemeCursorState::Break
555+
} else {
556+
GraphemeCursorState::NotBreak
557+
};
558+
}
559+
560+
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
561+
self.decide(is_break);
562+
Ok(is_break)
563+
}
564+
565+
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
566+
if self.state == GraphemeCursorState::Break {
567+
Ok(true)
568+
} else if self.state == GraphemeCursorState::NotBreak {
569+
Ok(false)
570+
} else if let Some(pre_context_offset) = self.pre_context_offset {
571+
Err(GraphemeIncomplete::PreContext(pre_context_offset))
572+
} else {
573+
unreachable!("inconsistent state");
574+
}
575+
}
576+
577+
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
578+
use tables::grapheme as gr;
579+
let mut ris_count = self.ris_count.unwrap_or(0);
580+
for ch in chunk.chars().rev() {
581+
if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
582+
self.ris_count = Some(ris_count);
583+
self.decide((ris_count & 1) == 0);
584+
return;
585+
}
586+
ris_count += 1;
587+
}
588+
self.ris_count = Some(ris_count);
589+
if chunk_start == 0 {
590+
self.decide((ris_count & 1) == 0);
591+
return;
592+
}
593+
self.pre_context_offset = Some(chunk_start);
594+
}
595+
596+
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
597+
use tables::grapheme as gr;
598+
for ch in chunk.chars().rev() {
599+
match gr::grapheme_category(ch) {
600+
gr::GC_Extend => (),
601+
gr::GC_E_Base | gr::GC_E_Base_GAZ => {
602+
self.decide(false);
603+
return;
604+
}
605+
_ => {
606+
self.decide(true);
607+
return;
608+
}
609+
}
610+
}
611+
if chunk_start == 0 {
612+
self.decide(true);
613+
return;
614+
}
615+
self.pre_context_offset = Some(chunk_start);
616+
}
617+
618+
pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
619+
use tables::grapheme as gr;
620+
if self.state == GraphemeCursorState::Break {
621+
return Ok(true)
622+
}
623+
if self.state == GraphemeCursorState::NotBreak {
624+
return Ok(false)
625+
}
626+
if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
627+
return Err(GraphemeIncomplete::InvalidOffset)
628+
}
629+
if let Some(pre_context_offset) = self.pre_context_offset {
630+
return Err(GraphemeIncomplete::PreContext(pre_context_offset));
631+
}
632+
let offset_in_chunk = self.offset - chunk_start;
633+
if self.catb.is_none() {
634+
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
635+
self.catb = Some(gr::grapheme_category(ch));
636+
}
637+
if self.offset == chunk_start {
638+
match self.catb.unwrap() {
639+
gr::GC_Control => {
640+
if chunk.as_bytes()[offset_in_chunk] == b'\n' {
641+
self.state = GraphemeCursorState::CheckCrlf;
642+
}
643+
}
644+
gr::GC_Regional_Indicator => self.state = GraphemeCursorState::Regional,
645+
gr::GC_E_Modifier => self.state = GraphemeCursorState::Emoji,
646+
_ => ()
647+
}
648+
self.pre_context_offset = Some(chunk_start);
649+
return Err(GraphemeIncomplete::PreContext(chunk_start));
650+
}
651+
if self.cat.is_none() {
652+
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
653+
self.cat = Some(gr::grapheme_category(ch));
654+
}
655+
match check_pair(self.cat.unwrap(), self.catb.unwrap()) {
656+
PairResult::NotBreak => return self.decision(false),
657+
PairResult::Break => return self.decision(true),
658+
PairResult::Extended => {
659+
let is_extended = self.is_extended;
660+
return self.decision(is_extended);
661+
}
662+
PairResult::CheckCrlf => {
663+
if chunk.as_bytes()[offset_in_chunk] != b'\n' {
664+
return self.decision(true);
665+
}
666+
if self.offset > chunk_start {
667+
return self.decision(chunk.as_bytes()[offset_in_chunk - 1] != b'\r');
668+
}
669+
self.state = GraphemeCursorState::CheckCrlf;
670+
return Err(GraphemeIncomplete::PreContext(chunk_start));
671+
}
672+
PairResult::Regional => {
673+
self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
674+
self.is_boundary_result()
675+
}
676+
PairResult::Emoji => {
677+
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
678+
self.is_boundary_result()
679+
}
680+
}
681+
}
682+
683+
}

0 commit comments

Comments
 (0)