diff options
| author | Harris Kaufmann <harris@kaufmann.site> | 2018-12-18 00:51:08 +0100 |
|---|---|---|
| committer | Harris Kaufmann <harris@kaufmann.site> | 2018-12-18 00:51:08 +0100 |
| commit | f3446d7c9b3ea85d7c00e014987d64d9753b44eb (patch) | |
| tree | 541c58a32909136bb1bafbad82cf13ecec428bbb | |
| parent | 5a3ce47b3952a6cc8db30e0c69548046949f6302 (diff) | |
xi editor .rs
| -rw-r--r-- | word_boundaries.rs | 215 |
1 files changed, 215 insertions, 0 deletions
diff --git a/word_boundaries.rs b/word_boundaries.rs new file mode 100644 index 0000000..c089e7b --- /dev/null +++ b/word_boundaries.rs @@ -0,0 +1,215 @@ +// Copyright 2017 The xi-editor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Segmentation of word boundaries. Note: this current implementation +//! is intended to work for code. Future work is to make it Unicode aware. + +use xi_rope::{Cursor, Rope, RopeInfo}; + +pub struct WordCursor<'a> { + inner: Cursor<'a, RopeInfo>, +} + +impl<'a> WordCursor<'a> { + pub fn new(text: &'a Rope, pos: usize) -> WordCursor<'a> { + let inner = Cursor::new(text, pos); + WordCursor { inner } + } + + /// Get previous boundary, and set the cursor at the boundary found. + pub fn prev_boundary(&mut self) -> Option<usize> { + if let Some(ch) = self.inner.prev_codepoint() { + let mut prop = get_word_property(ch); + let mut candidate = self.inner.pos(); + while let Some(prev) = self.inner.prev_codepoint() { + let prop_prev = get_word_property(prev); + if classify_boundary(prop_prev, prop).is_start() { + break; + } + prop = prop_prev; + candidate = self.inner.pos(); + } + self.inner.set(candidate); + return Some(candidate); + } + None + } + + /// Get next boundary, and set the cursor at the boundary found. + pub fn next_boundary(&mut self) -> Option<usize> { + if let Some(ch) = self.inner.next_codepoint() { + let mut prop = get_word_property(ch); + let mut candidate = self.inner.pos(); + while let Some(next) = self.inner.next_codepoint() { + let prop_next = get_word_property(next); + if classify_boundary(prop, prop_next).is_end() { + break; + } + prop = prop_next; + candidate = self.inner.pos(); + } + self.inner.set(candidate); + return Some(candidate); + } + None + } + + /// Return the selection for the word containing the current cursor. The + /// cursor is moved to the end of that selection. + pub fn select_word(&mut self) -> (usize, usize) { + let initial = self.inner.pos(); + let init_prop_after = self.inner.next_codepoint().map(get_word_property); + self.inner.set(initial); + let init_prop_before = self.inner.prev_codepoint().map(get_word_property); + let mut start = initial; + let init_boundary = if let (Some(pb), Some(pa)) = (init_prop_before, init_prop_after) { + classify_boundary_initial(pb, pa) + } else { + WordBoundary::Both + }; + let mut prop_after = init_prop_after; + let mut prop_before = init_prop_before; + if prop_after.is_none() { + start = self.inner.pos(); + prop_after = prop_before; + prop_before = self.inner.prev_codepoint().map(get_word_property); + } + while let (Some(pb), Some(pa)) = (prop_before, prop_after) { + if start == initial { + if init_boundary.is_start() { + break; + } + } else if !init_boundary.is_boundary() { + if classify_boundary(pb, pa).is_boundary() { + break; + } + } else if classify_boundary(pb, pa).is_start() { + break; + } + start = self.inner.pos(); + prop_after = prop_before; + prop_before = self.inner.prev_codepoint().map(get_word_property); + } + self.inner.set(initial); + let mut end = initial; + prop_after = init_prop_after; + prop_before = init_prop_before; + if prop_before.is_none() { + prop_before = self.inner.next_codepoint().map(get_word_property); + end = self.inner.pos(); + prop_after = self.inner.next_codepoint().map(get_word_property); + } + while let (Some(pb), Some(pa)) = (prop_before, prop_after) { + if end == initial { + if init_boundary.is_end() { + break; + } + } else if !init_boundary.is_boundary() { + if classify_boundary(pb, pa).is_boundary() { + break; + } + } else if classify_boundary(pb, pa).is_end() { + break; + } + end = self.inner.pos(); + prop_before = prop_after; + prop_after = self.inner.next_codepoint().map(get_word_property); + } + self.inner.set(end); + (start, end) + } +} + +#[derive(PartialEq, Eq)] +enum WordBoundary { + Interior, + Start, // a boundary indicating the end of a word + End, // a boundary indicating the start of a word + Both, +} + +impl WordBoundary { + fn is_start(&self) -> bool { + *self == WordBoundary::Start || *self == WordBoundary::Both + } + + fn is_end(&self) -> bool { + *self == WordBoundary::End || *self == WordBoundary::Both + } + + fn is_boundary(&self) -> bool { + *self != WordBoundary::Interior + } +} + +fn classify_boundary(prev: WordProperty, next: WordProperty) -> WordBoundary { + use self::WordBoundary::*; + use self::WordProperty::*; + match (prev, next) { + (Lf, _) => Both, + (_, Lf) => Both, + (Space, Other) => Start, + (Space, Punctuation) => Start, + (Punctuation, Other) => Start, + (Other, Space) => End, + (Punctuation, Space) => End, + (Other, Punctuation) => End, + _ => Interior, + } +} + +fn classify_boundary_initial(prev: WordProperty, next: WordProperty) -> WordBoundary { + use self::WordBoundary::*; + use self::WordProperty::*; + match (prev, next) { + (Lf, Other) => Start, + (Other, Lf) => End, + (Lf, Space) => Interior, + (Lf, Punctuation) => Interior, + (Space, Lf) => Interior, + (Punctuation, Lf) => Interior, + (Space, Punctuation) => Interior, + (Punctuation, Space) => Interior, + _ => classify_boundary(prev, next), + } +} + +#[derive(Copy, Clone)] +enum WordProperty { + Lf, + Space, + Punctuation, + Other, // includes letters and all of non-ascii unicode +} + +fn get_word_property(codepoint: char) -> WordProperty { + if codepoint <= ' ' { + // TODO: deal with \r + if codepoint == '\n' { + return WordProperty::Lf; + } + return WordProperty::Space; + } else if codepoint <= '\u{3f}' { + // Hardcoded: !"#$%&'()*+,-./:;<=>? + if (0xfc00fffe00000000u64 >> (codepoint as u32)) & 1 != 0 { + return WordProperty::Punctuation; + } + } else if codepoint <= '\u{7f}' { + // Hardcoded: @[\]^`{|}~ + if (0x7800000178000001u64 >> ((codepoint as u32) & 0x3f)) & 1 != 0 { + return WordProperty::Punctuation; + } + } + WordProperty::Other +} |
