aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHarris Kaufmann <harris@kaufmann.site>2018-12-18 00:51:08 +0100
committerHarris Kaufmann <harris@kaufmann.site>2018-12-18 00:51:08 +0100
commitf3446d7c9b3ea85d7c00e014987d64d9753b44eb (patch)
tree541c58a32909136bb1bafbad82cf13ecec428bbb
parent5a3ce47b3952a6cc8db30e0c69548046949f6302 (diff)
xi editor .rs
-rw-r--r--word_boundaries.rs215
1 files changed, 215 insertions, 0 deletions
diff --git a/word_boundaries.rs b/word_boundaries.rs
new file mode 100644
index 0000000..c089e7b
--- /dev/null
+++ b/word_boundaries.rs
@@ -0,0 +1,215 @@
+// Copyright 2017 The xi-editor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Segmentation of word boundaries. Note: this current implementation
+//! is intended to work for code. Future work is to make it Unicode aware.
+
+use xi_rope::{Cursor, Rope, RopeInfo};
+
+/// A cursor for finding word boundaries in a `Rope`.
+///
+/// Wraps an `xi_rope` `Cursor` and moves it one codepoint at a time.
+pub struct WordCursor<'a> {
+ // Underlying rope cursor; its position is updated by every method.
+ inner: Cursor<'a, RopeInfo>,
+}
+
+impl<'a> WordCursor<'a> {
+    /// Creates a word cursor over `text`, placed at position `pos`.
+    pub fn new(text: &'a Rope, pos: usize) -> WordCursor<'a> {
+        WordCursor {
+            inner: Cursor::new(text, pos),
+        }
+    }
+
+    /// Get previous boundary, and set the cursor at the boundary found.
+    ///
+    /// Returns `None` when there is no codepoint before the cursor.
+    pub fn prev_boundary(&mut self) -> Option<usize> {
+        let first = self.inner.prev_codepoint()?;
+        // Property of the codepoint just to the right of the position being examined.
+        let mut right_prop = get_word_property(first);
+        let mut boundary = self.inner.pos();
+        loop {
+            let left_prop = match self.inner.prev_codepoint() {
+                Some(ch) => get_word_property(ch),
+                None => break,
+            };
+            if classify_boundary(left_prop, right_prop).is_start() {
+                break;
+            }
+            right_prop = left_prop;
+            boundary = self.inner.pos();
+        }
+        // The scan has stepped one codepoint past the boundary; move back onto it.
+        self.inner.set(boundary);
+        Some(boundary)
+    }
+
+    /// Get next boundary, and set the cursor at the boundary found.
+    ///
+    /// Returns `None` when there is no codepoint after the cursor.
+    pub fn next_boundary(&mut self) -> Option<usize> {
+        let first = self.inner.next_codepoint()?;
+        // Property of the codepoint just to the left of the position being examined.
+        let mut left_prop = get_word_property(first);
+        let mut boundary = self.inner.pos();
+        loop {
+            let right_prop = match self.inner.next_codepoint() {
+                Some(ch) => get_word_property(ch),
+                None => break,
+            };
+            if classify_boundary(left_prop, right_prop).is_end() {
+                break;
+            }
+            left_prop = right_prop;
+            boundary = self.inner.pos();
+        }
+        // The scan has stepped one codepoint past the boundary; move back onto it.
+        self.inner.set(boundary);
+        Some(boundary)
+    }
+
+    /// Return the selection for the word containing the current cursor. The
+    /// cursor is moved to the end of that selection.
+    ///
+    /// The result is a `(start, end)` pair of positions.
+    pub fn select_word(&mut self) -> (usize, usize) {
+        let origin = self.inner.pos();
+
+        // Look at the codepoints on either side of the origin. After these
+        // calls the cursor sits before the preceding codepoint (if there is
+        // one), which is where the backward scan below continues from.
+        let origin_after = self.inner.next_codepoint().map(get_word_property);
+        self.inner.set(origin);
+        let origin_before = self.inner.prev_codepoint().map(get_word_property);
+
+        // With a neighbour missing on either side, the origin is treated as
+        // a boundary in both directions.
+        let origin_boundary = match (origin_before, origin_after) {
+            (Some(before), Some(after)) => classify_boundary_initial(before, after),
+            _ => WordBoundary::Both,
+        };
+
+        // Scan backward for the start of the selection.
+        let mut start = origin;
+        let mut right = origin_after;
+        let mut left = origin_before;
+        if right.is_none() {
+            // Nothing follows the origin: shift the window one codepoint back.
+            start = self.inner.pos();
+            right = left;
+            left = self.inner.prev_codepoint().map(get_word_property);
+        }
+        while let (Some(pb), Some(pa)) = (left, right) {
+            let stop = if start == origin {
+                origin_boundary.is_start()
+            } else {
+                let here = classify_boundary(pb, pa);
+                if origin_boundary.is_boundary() {
+                    here.is_start()
+                } else {
+                    here.is_boundary()
+                }
+            };
+            if stop {
+                break;
+            }
+            start = self.inner.pos();
+            right = left;
+            left = self.inner.prev_codepoint().map(get_word_property);
+        }
+
+        // Scan forward from the origin for the end of the selection.
+        self.inner.set(origin);
+        let mut end = origin;
+        right = origin_after;
+        left = origin_before;
+        if left.is_none() {
+            // Nothing precedes the origin: shift the window one codepoint forward.
+            left = self.inner.next_codepoint().map(get_word_property);
+            end = self.inner.pos();
+            right = self.inner.next_codepoint().map(get_word_property);
+        }
+        while let (Some(pb), Some(pa)) = (left, right) {
+            let stop = if end == origin {
+                origin_boundary.is_end()
+            } else {
+                let here = classify_boundary(pb, pa);
+                if origin_boundary.is_boundary() {
+                    here.is_end()
+                } else {
+                    here.is_boundary()
+                }
+            };
+            if stop {
+                break;
+            }
+            end = self.inner.pos();
+            left = right;
+            right = self.inner.next_codepoint().map(get_word_property);
+        }
+
+        self.inner.set(end);
+        (start, end)
+    }
+}
+
+/// Classification of the position between two adjacent codepoints.
+#[derive(PartialEq, Eq)]
+enum WordBoundary {
+ /// Not a boundary.
+ Interior,
+ Start, // a boundary indicating the start of a word
+ End, // a boundary indicating the end of a word
+ /// Counts as both a start and an end (e.g. next to a line feed).
+ Both,
+}
+
+impl WordBoundary {
+    /// True for `Start` and `Both`.
+    fn is_start(&self) -> bool {
+        match *self {
+            WordBoundary::Start | WordBoundary::Both => true,
+            WordBoundary::Interior | WordBoundary::End => false,
+        }
+    }
+
+    /// True for `End` and `Both`.
+    fn is_end(&self) -> bool {
+        match *self {
+            WordBoundary::End | WordBoundary::Both => true,
+            WordBoundary::Interior | WordBoundary::Start => false,
+        }
+    }
+
+    /// True for every variant except `Interior`.
+    fn is_boundary(&self) -> bool {
+        match *self {
+            WordBoundary::Interior => false,
+            WordBoundary::Start | WordBoundary::End | WordBoundary::Both => true,
+        }
+    }
+}
+
+/// Classifies the position between two adjacent codepoints, given the word
+/// property of the codepoint before it (`prev`) and after it (`next`).
+fn classify_boundary(prev: WordProperty, next: WordProperty) -> WordBoundary {
+    use self::WordProperty::*;
+    match (prev, next) {
+        // A line feed on either side is a boundary in both directions.
+        (Lf, _) | (_, Lf) => WordBoundary::Both,
+        (Space, Other) | (Space, Punctuation) | (Punctuation, Other) => WordBoundary::Start,
+        (Other, Space) | (Punctuation, Space) | (Other, Punctuation) => WordBoundary::End,
+        _ => WordBoundary::Interior,
+    }
+}
+
+/// Variant of `classify_boundary` used by `select_word` for the position
+/// where the selection starts.
+///
+/// Pairs not listed here fall through to `classify_boundary`.
+fn classify_boundary_initial(prev: WordProperty, next: WordProperty) -> WordBoundary {
+    use self::WordProperty::*;
+    match (prev, next) {
+        (Lf, Other) => WordBoundary::Start,
+        (Other, Lf) => WordBoundary::End,
+        (Lf, Space)
+        | (Lf, Punctuation)
+        | (Space, Lf)
+        | (Punctuation, Lf)
+        | (Space, Punctuation)
+        | (Punctuation, Space) => WordBoundary::Interior,
+        _ => classify_boundary(prev, next),
+    }
+}
+
+/// Coarse character class used to decide where words begin and end.
+#[derive(Copy, Clone)]
+enum WordProperty {
+ /// The line feed character `'\n'`.
+ Lf,
+ /// Any other codepoint at or below `' '` (this currently includes `'\r'`).
+ Space,
+ /// ASCII punctuation; `_` is not included.
+ Punctuation,
+ Other, // includes letters and all of non-ascii unicode
+}
+
+/// Maps a codepoint to its `WordProperty`.
+///
+/// Only ASCII is distinguished; every codepoint above `'\u{7f}'` is `Other`.
+fn get_word_property(codepoint: char) -> WordProperty {
+    match codepoint {
+        '\n' => WordProperty::Lf,
+        // TODO: deal with \r
+        c if c <= ' ' => WordProperty::Space,
+        // !"#$%&'()*+,-./ and :;<=>?
+        '!'..='/' | ':'..='?' => WordProperty::Punctuation,
+        // @[\]^`{|}~
+        '@' | '['..='^' | '`' | '{'..='~' => WordProperty::Punctuation,
+        _ => WordProperty::Other,
+    }
+}