xi editor .rs

author: Harris Kaufmann <harris@kaufmann.site> 2018-12-18 00:51:08 +0100
committer: Harris Kaufmann <harris@kaufmann.site> 2018-12-18 00:51:08 +0100
commit: f3446d7c9b3ea85d7c00e014987d64d9753b44eb (patch)
tree: 541c58a32909136bb1bafbad82cf13ecec428bbb
parent: 5a3ce47b3952a6cc8db30e0c69548046949f6302 (diff)
1 files changed, 215 insertions, 0 deletions
diff --git a/word_boundaries.rs b/word_boundaries.rs
new file mode 100644
index 0000000..c089e7b
--- /dev/null
+++ b/word_boundaries.rs
@@ -0,0 +1,215 @@
+// Copyright 2017 The xi-editor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Segmentation of word boundaries. Note: the current implementation
+//! is intended to work for code. Future work is to make it Unicode-aware.
+
+use xi_rope::{Cursor, Rope, RopeInfo};
+
+pub struct WordCursor<'a> {
+    inner: Cursor<'a, RopeInfo>, // rope cursor that every method below reads from and moves
+}
+
+impl<'a> WordCursor<'a> {
+    /// Builds a `WordCursor` over `text` with its inner cursor created at `pos`.
+    pub fn new(text: &'a Rope, pos: usize) -> WordCursor<'a> {
+        WordCursor { inner: Cursor::new(text, pos) }
+    }
+
+    /// Scan backwards for the closest start-of-word boundary (stopping where
+    /// the codepoints run out), move the cursor there and return the offset.
+    /// Yields `None` when there is no previous codepoint at all.
+    pub fn prev_boundary(&mut self) -> Option<usize> {
+        let first = self.inner.prev_codepoint()?;
+        let mut right = get_word_property(first);
+        let mut boundary = self.inner.pos();
+        while let Some(ch) = self.inner.prev_codepoint() {
+            let left = get_word_property(ch);
+            if classify_boundary(left, right).is_start() {
+                break;
+            }
+            right = left;
+            boundary = self.inner.pos();
+        }
+        self.inner.set(boundary);
+        Some(boundary)
+    }
+
+    /// Scan forwards for the closest end-of-word boundary (stopping where the
+    /// codepoints run out), move the cursor there and return the offset.
+    /// Yields `None` when there is no next codepoint at all.
+    pub fn next_boundary(&mut self) -> Option<usize> {
+        let first = self.inner.next_codepoint()?;
+        let mut left = get_word_property(first);
+        let mut boundary = self.inner.pos();
+        while let Some(ch) = self.inner.next_codepoint() {
+            let right = get_word_property(ch);
+            if classify_boundary(left, right).is_end() {
+                break;
+            }
+            left = right;
+            boundary = self.inner.pos();
+        }
+        self.inner.set(boundary);
+        Some(boundary)
+    }
+
+    /// Return the `(start, end)` offsets of the word containing the current
+    /// cursor. The cursor is moved to the end of that selection.
+    pub fn select_word(&mut self) -> (usize, usize) {
+        let initial = self.inner.pos(); // offset both scans start from
+        let init_prop_after = self.inner.next_codepoint().map(get_word_property); // class after the cursor, if any
+        self.inner.set(initial); // undo the step taken by the peek above
+        let init_prop_before = self.inner.prev_codepoint().map(get_word_property); // class before the cursor, if any
+        let mut start = initial;
+        let init_boundary = if let (Some(pb), Some(pa)) = (init_prop_before, init_prop_after) {
+            classify_boundary_initial(pb, pa)
+        } else {
+            WordBoundary::Both // a codepoint is missing on one side: counts as start and end
+        };
+        let mut prop_after = init_prop_after;
+        let mut prop_before = init_prop_before;
+        if prop_after.is_none() { // no codepoint after the cursor: begin the backward scan one step earlier
+            start = self.inner.pos();
+            prop_after = prop_before;
+            prop_before = self.inner.prev_codepoint().map(get_word_property);
+        }
+        while let (Some(pb), Some(pa)) = (prop_before, prop_after) { // backward scan that settles `start`
+            if start == initial { // still at the cursor: the initial classification decides
+                if init_boundary.is_start() {
+                    break;
+                }
+            } else if !init_boundary.is_boundary() { // cursor was not on a boundary: stop at any boundary
+                if classify_boundary(pb, pa).is_boundary() {
+                    break;
+                }
+            } else if classify_boundary(pb, pa).is_start() { // cursor was on a boundary: stop only at a start
+                break;
+            }
+            start = self.inner.pos();
+            prop_after = prop_before;
+            prop_before = self.inner.prev_codepoint().map(get_word_property);
+        }
+        self.inner.set(initial); // rewind before scanning forwards
+        let mut end = initial;
+        prop_after = init_prop_after;
+        prop_before = init_prop_before;
+        if prop_before.is_none() { // no codepoint before the cursor: begin the forward scan one step later
+            prop_before = self.inner.next_codepoint().map(get_word_property);
+            end = self.inner.pos();
+            prop_after = self.inner.next_codepoint().map(get_word_property);
+        }
+        while let (Some(pb), Some(pa)) = (prop_before, prop_after) { // forward scan that settles `end`
+            if end == initial { // still at the cursor: the initial classification decides
+                if init_boundary.is_end() {
+                    break;
+                }
+            } else if !init_boundary.is_boundary() { // cursor was not on a boundary: stop at any boundary
+                if classify_boundary(pb, pa).is_boundary() {
+                    break;
+                }
+            } else if classify_boundary(pb, pa).is_end() { // cursor was on a boundary: stop only at an end
+                break;
+            }
+            end = self.inner.pos();
+            prop_before = prop_after;
+            prop_after = self.inner.next_codepoint().map(get_word_property);
+        }
+        self.inner.set(end); // leave the cursor at the end of the selection
+        (start, end)
+    }
+}
+
+#[derive(PartialEq, Eq)]
+enum WordBoundary {
+    Interior, // not a boundary
+    Start,    // a boundary indicating the start of a word
+    End,      // a boundary indicating the end of a word
+    Both,     // a boundary that counts as both a start and an end
+}
+
+impl WordBoundary {
+    /// Whether this value counts as a start boundary (`Start` or `Both`).
+    fn is_start(&self) -> bool {
+        match *self {
+            WordBoundary::Start | WordBoundary::Both => true,
+            WordBoundary::Interior | WordBoundary::End => false,
+        }
+    }
+
+    /// Whether this value counts as an end boundary (`End` or `Both`).
+    fn is_end(&self) -> bool {
+        match *self {
+            WordBoundary::End | WordBoundary::Both => true,
+            WordBoundary::Interior | WordBoundary::Start => false,
+        }
+    }
+
+    /// Whether this value is any kind of boundary, i.e. anything but `Interior`.
+    fn is_boundary(&self) -> bool {
+        match *self {
+            WordBoundary::Interior => false,
+            WordBoundary::Start | WordBoundary::End | WordBoundary::Both => true,
+        }
+    }
+}
+
+/// Classifies the position between two adjacent codepoints from their classes.
+///
+/// A line feed on either side gives `Both`. Otherwise moving to a "stronger"
+/// class (space -> punctuation -> other) is a `Start`, moving to a weaker one
+/// is an `End`, and two equal classes are `Interior`.
+fn classify_boundary(prev: WordProperty, next: WordProperty) -> WordBoundary {
+    use self::WordBoundary::{Both, End, Interior, Start};
+    use self::WordProperty::{Lf, Other, Punctuation, Space};
+    match (prev, next) {
+        (Lf, _) | (_, Lf) => Both,
+        (Space, Other) | (Space, Punctuation) | (Punctuation, Other) => Start,
+        (Other, Space) | (Punctuation, Space) | (Other, Punctuation) => End,
+        _ => Interior,
+    }
+}
+
+/// Variant of `classify_boundary` used for the position `select_word` starts
+/// from. A line feed next to `Other` is only a `Start`/`End`, and a pair of two
+/// different classes among line feed, space and punctuation is `Interior`;
+/// every other pair defers to `classify_boundary`.
+fn classify_boundary_initial(prev: WordProperty, next: WordProperty) -> WordBoundary {
+    use self::WordBoundary::{End, Interior, Start};
+    use self::WordProperty::{Lf, Other, Punctuation, Space};
+    match (prev, next) {
+        (Lf, Other) => Start,
+        (Other, Lf) => End,
+        (Lf, Space) | (Lf, Punctuation) | (Space, Lf) | (Punctuation, Lf) => Interior,
+        (Space, Punctuation) | (Punctuation, Space) => Interior,
+        _ => classify_boundary(prev, next),
+    }
+}
+
+#[derive(Copy, Clone)]
+enum WordProperty {
+    Lf,          // the line feed character '\n'
+    Space,       // any other codepoint at or below ' ' (U+0020)
+    Punctuation, // ASCII punctuation; '_' is not included
+    Other,       // everything else: letters, digits, '_' and all of non-ASCII Unicode
+}
+
+/// Maps a codepoint to its `WordProperty` class.
+///
+/// `'\n'` is `Lf`; every other codepoint up to and including `' '` is `Space`;
+/// ASCII punctuation is looked up in two 64-bit bitmasks; everything else
+/// (including all codepoints above U+007F) is `Other`.
+fn get_word_property(codepoint: char) -> WordProperty {
+    // Bit `n` set means the character with code `n` is punctuation.
+    // Hardcoded: !"#$%&'()*+,-./:;<=>?
+    const LOW_PUNCTUATION: u64 = 0xfc00fffe00000000;
+    // Same idea for codes 0x40..=0x7f, indexed by the low six bits of the code.
+    // Hardcoded: @[\]^`{|}~
+    const HIGH_PUNCTUATION: u64 = 0x7800000178000001;
+
+    let code = codepoint as u32;
+    match codepoint {
+        '\n' => WordProperty::Lf,
+        // TODO: deal with \r
+        '\0'..=' ' => WordProperty::Space,
+        '!'..='?' if (LOW_PUNCTUATION >> code) & 1 != 0 => WordProperty::Punctuation,
+        '@'..='\u{7f}' if (HIGH_PUNCTUATION >> (code & 0x3f)) & 1 != 0 => {
+            WordProperty::Punctuation
+        }
+        _ => WordProperty::Other,
+    }
+}
author	Harris Kaufmann <harris@kaufmann.site>	2018-12-18 00:51:08 +0100
committer	Harris Kaufmann <harris@kaufmann.site>	2018-12-18 00:51:08 +0100
commit	f3446d7c9b3ea85d7c00e014987d64d9753b44eb (patch)
tree	541c58a32909136bb1bafbad82cf13ecec428bbb
parent	5a3ce47b3952a6cc8db30e0c69548046949f6302 (diff)