فهرست منبع

Initial regex.h implementation

jD91mZM2 6 سال پیش
والد
کامیت
1acc2a1a32
13فایلهای تغییر یافته به همراه238 افزوده شده و 0 حذف شده
  1. 7 0
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 1 0
      Makefile
  4. 1 0
      include/sys/types.h
  5. 1 0
      src/header/mod.rs
  6. 7 0
      src/header/regex/cbindgen.toml
  7. 183 0
      src/header/regex/mod.rs
  8. 1 0
      src/lib.rs
  9. 1 0
      src/platform/types.rs
  10. 1 0
      tests/Makefile
  11. 0 0
      tests/expected/regex.stderr
  12. 3 0
      tests/expected/regex.stdout
  13. 31 0
      tests/regex.c

+ 7 - 0
Cargo.lock

@@ -148,6 +148,11 @@ name = "num-traits"
 version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
+[[package]]
+name = "posix-regex"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
 [[package]]
 name = "proc-macro2"
 version = "0.2.3"
@@ -227,6 +232,7 @@ dependencies = [
  "compiler_builtins 0.1.0 (git+https://github.com/rust-lang-nursery/compiler-builtins.git)",
  "core_io 0.1.20180619",
  "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "posix-regex 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "ralloc 1.0.0",
  "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
  "redox_syscall 0.1.40 (git+https://gitlab.redox-os.org/redox-os/syscall.git?branch=relibc)",
@@ -464,6 +470,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
 "checksum log 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)" = "d4fcce5fa49cc693c312001daf1d13411c4a5283796bac1084299ea3e567113f"
 "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1"
+"checksum posix-regex 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "58b31ca4f5022c6c0a22206d63c177be2f418355db5a713db22bd901c6ac0db3"
 "checksum proc-macro2 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "cd07deb3c6d1d9ff827999c7f9b04cdfd66b1b17ae508e14fe47b620f2282ae0"
 "checksum quote 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e920b65c65f10b2ae65c831a81a073a89edd28c7cce89475bff467ab4167a"
 "checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd"

+ 1 - 0
Cargo.toml

@@ -18,6 +18,7 @@ cc = "1.0.17"
 cbitset = "0.1.0"
 core_io = { path = "core_io", features = ["collections"] }
 lazy_static = { version = "*", features = ["nightly", "spin_no_std"] }
+posix-regex = { version = "0.1", features = ["no_std"] }
 rand = { version = "0.5.2", default-features = false }
 va_list = { path = "va_list", features = ["no_std"] }
 

+ 1 - 0
Makefile

@@ -53,6 +53,7 @@ libc: $(BUILD)/release/libc.a $(BUILD)/release/crt0.o $(BUILD)/release/crti.o $(
 libm: $(BUILD)/openlibm/libopenlibm.a
 
 sysroot: all
+	rm -rf $@
 	rm -rf [email protected]
 	mkdir -p [email protected]
 	make install [email protected]

+ 1 - 0
include/sys/types.h

@@ -20,6 +20,7 @@ typedef long clock_t;
 typedef int clockid_t;
 typedef void* timer_t;
 typedef unsigned long int blkcnt_t;
+typedef size_t regoff_t;
 
 typedef unsigned char u_char, uchar;
 typedef unsigned short u_short, ushort;

+ 1 - 0
src/header/mod.rs

@@ -14,6 +14,7 @@ pub mod netdb;
 pub mod netinet_in;
 //pub mod pthread;
 pub mod pwd;
+pub mod regex;
 pub mod semaphore;
 pub mod setjmp;
 pub mod sgtty;

+ 7 - 0
src/header/regex/cbindgen.toml

@@ -0,0 +1,7 @@
+# cbindgen settings for the generated regex.h header.
+sys_includes = ["sys/types.h"]
+# The guard must be unique per header; the template default would collide
+# with any other header that kept it.
+include_guard = "_REGEX_H"
+language = "C"
+style = "Type"
+
+[enum]
+prefix_with_name = true

+ 183 - 0
src/header/regex/mod.rs

@@ -0,0 +1,183 @@
+//! regex.h implementation, following http://pubs.opengroup.org/onlinepubs/7908799/xsh/regex.h.html
+
+use alloc::borrow::Cow;
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use core::{mem, slice, ptr};
+use header::string::strlen;
+use platform::types::*;
+use posix_regex::{PosixRegexBuilder, PosixRegex};
+use posix_regex::compile::{Error as CompileError, Token, Range};
+
+#[repr(C)] // layout is shared with C callers
+pub struct regex_t {
+    // Can't be a normal Vec<T> because then the struct size won't be known
+    // from C. These three fields are the raw parts of the Vec built by regcomp.
+    ptr: *mut c_void,
+    length: size_t,
+    capacity: size_t,
+
+    cflags: c_int, // flags given to regcomp; regexec ORs them with its own eflags
+    re_nsub: size_t // number of groups counted by count_groups in regcomp
+}
+#[repr(C)] // layout is shared with C callers
+pub struct regmatch_t {
+    rm_so: regoff_t, // start offset of a group; regexec stores !0 in unused slots
+    rm_eo: regoff_t // end offset of a group; regexec stores !0 in unused slots
+}
+
+pub const REG_EXTENDED: c_int = 1; // flag bit: unsupported, regcomp/regexec answer REG_ENOSYS
+pub const REG_ICASE:    c_int = 2; // flag bit: forwarded to PosixRegex::case_insensitive
+pub const REG_NOSUB:    c_int = 4; // flag bit: asks regexec not to fill pmatch
+pub const REG_NEWLINE:  c_int = 8; // flag bit: forwarded to PosixRegex::newline
+pub const REG_NOTBOL:   c_int = 16; // flag bit: forwarded to PosixRegex::no_start
+pub const REG_NOTEOL:   c_int = 32; // flag bit: forwarded to PosixRegex::no_end
+
+pub const REG_NOMATCH:  c_int = 1; // error code: returned by regexec when nothing matched
+pub const REG_BADPAT:   c_int = 2; // error code: invalid regexp (regcomp's fallback error)
+pub const REG_ECOLLATE: c_int = 3; // error code: unknown collating element
+pub const REG_ECTYPE:   c_int = 4; // error code: unknown character class name
+pub const REG_EESCAPE:  c_int = 5; // error code: trailing backslash
+pub const REG_ESUBREG:  c_int = 6; // error code: invalid back reference
+pub const REG_EBRACK:   c_int = 7; // error code: missing ']'
+pub const REG_ENOSYS:   c_int = 8; // error code: unsupported operation
+pub const REG_EPAREN:   c_int = 9; // error code: missing ')'
+pub const REG_EBRACE:   c_int = 10; // error code: missing '}'
+pub const REG_BADBR:    c_int = 11; // error code: invalid contents of {}
+pub const REG_ERANGE:   c_int = 12; // error code: invalid character range
+pub const REG_ESPACE:   c_int = 13; // error code: out of memory
+pub const REG_BADRPT:   c_int = 14; // error code: repetition not preceded by valid expression
+
+/// Counts every `Token::Group` found in `branches`, including groups nested
+/// inside other groups.
+fn count_groups(branches: &[Vec<(Token, Range)>]) -> usize {
+    branches
+        .iter()
+        .flat_map(|branch| branch.iter())
+        .map(|(token, _)| match token {
+            // A group counts itself plus whatever groups it contains.
+            Token::Group(inner) => 1 + count_groups(inner),
+            _ => 0,
+        })
+        .sum()
+}
+
+/// Compiles the NUL-terminated pattern `pat` and stores the result in `*out`.
+///
+/// Returns 0 on success. `REG_EXTENDED` is not supported and yields
+/// `REG_ENOSYS`; compile errors are translated to the matching `REG_*` code,
+/// with `REG_BADPAT` as the fallback.
+#[no_mangle]
+pub extern "C" fn regcomp(out: *mut regex_t, pat: *const c_char, cflags: c_int) -> c_int {
+    if cflags & REG_EXTENDED == REG_EXTENDED {
+        return REG_ENOSYS;
+    }
+
+    let pattern = unsafe { slice::from_raw_parts(pat as *const u8, strlen(pat)) };
+    let compiled = PosixRegexBuilder::new(pattern)
+        .with_default_classes()
+        .compile_tokens();
+
+    let mut branches = match compiled {
+        Ok(branches) => branches,
+        Err(err) => return match err {
+            CompileError::EmptyRepetition
+                | CompileError::IntegerOverflow
+                | CompileError::IllegalRange => REG_BADBR,
+            CompileError::UnclosedRepetition => REG_EBRACE,
+            CompileError::LeadingRepetition => REG_BADRPT,
+            CompileError::UnknownCollation => REG_ECOLLATE,
+            CompileError::UnknownClass(_) => REG_ECTYPE,
+            _ => REG_BADPAT
+        }
+    };
+
+    let re_nsub = count_groups(&branches);
+    let compiled_regex = regex_t {
+        ptr: branches.as_mut_ptr() as *mut c_void,
+        length: branches.len(),
+        capacity: branches.capacity(),
+
+        cflags,
+        re_nsub,
+    };
+    unsafe {
+        *out = compiled_regex;
+    }
+    // The vector's storage now belongs to `*out`; `regfree` rebuilds the
+    // vector from the raw parts and drops it.
+    mem::forget(branches);
+    0
+}
+/// Releases the pattern storage referenced by `*regex`.
+///
+/// # Safety
+///
+/// `regex` must point to a `regex_t` whose `ptr`, `length` and `capacity`
+/// fields are the raw parts of a live vector, as written by `regcomp`.
+#[no_mangle]
+pub unsafe extern "C" fn regfree(regex: *mut regex_t) {
+    let handle = &*regex;
+    // Rebuild the vector that `regcomp` leaked so that dropping it frees it.
+    let branches = Vec::from_raw_parts(
+        handle.ptr as *mut Vec<(Token, Range)>,
+        handle.length,
+        handle.capacity,
+    );
+    drop(branches);
+}
+/// Matches the NUL-terminated `input` against a pattern compiled by `regcomp`.
+///
+/// Returns 0 when a match is found, `REG_NOMATCH` when there is none, and
+/// `REG_ENOSYS` when `REG_EXTENDED` is set in `eflags`.
+///
+/// On a match the first `nmatch` entries of `pmatch` are filled: one entry
+/// per reported group, and `!0` offsets for the remaining entries. Nothing is
+/// written when `REG_NOSUB` is set (in either the `regcomp` flags or
+/// `eflags`), when `pmatch` is null, or when `nmatch` is 0.
+#[no_mangle]
+pub extern "C" fn regexec(regex: *const regex_t, input: *const c_char,
+                          nmatch: size_t, pmatch: *mut regmatch_t, eflags: c_int) -> c_int {
+    if eflags & REG_EXTENDED == REG_EXTENDED {
+        return REG_ENOSYS;
+    }
+
+    let regex = unsafe { &(*regex) };
+
+    // Allow specifying a compiler argument to the executor and vice versa
+    // because why not?
+    let flags = regex.cflags | eflags;
+
+    let input = unsafe { slice::from_raw_parts(input as *const u8, strlen(input)) };
+
+    // Borrow the branches stored by `regcomp` without taking ownership.
+    let branches = unsafe { slice::from_raw_parts(regex.ptr as *const Vec<(Token, Range)>, regex.length) };
+
+    let matches = PosixRegex::new(Cow::Borrowed(&branches))
+        .case_insensitive(flags & REG_ICASE == REG_ICASE)
+        .newline(flags & REG_NEWLINE == REG_NEWLINE)
+        .no_start(flags & REG_NOTBOL == REG_NOTBOL)
+        .no_end(flags & REG_NOTEOL == REG_NOTEOL)
+        .matches(input, Some(1));
+
+    // REG_NOSUB is a regcomp flag, so it has to be looked up in the combined
+    // flags; checking only `eflags` would never honour it.
+    if !matches.is_empty()
+            && flags & REG_NOSUB != REG_NOSUB
+            && !pmatch.is_null()
+            && nmatch > 0 {
+        let first = &matches[0];
+
+        // Report as many groups as both the match and the caller's array hold.
+        let len = first.len().min(nmatch as usize);
+        for i in 0..len {
+            let (start, end) = first[i];
+            unsafe {
+                *pmatch.offset(i as isize) = regmatch_t {
+                    rm_so: start,
+                    rm_eo: end
+                };
+            }
+        }
+        // Mark the rest of the caller's array as unused.
+        for i in len as isize..nmatch as isize {
+            unsafe {
+                *pmatch.offset(i) = regmatch_t {
+                    rm_so: !0,
+                    rm_eo: !0
+                };
+            }
+        }
+    }
+
+    if matches.is_empty() { REG_NOMATCH } else { 0 }
+}
+
+/// Writes a human-readable, NUL-terminated message for `code` into `out`.
+///
+/// At most `max` bytes are written, terminator included; a message that does
+/// not fit is truncated but still terminated. Nothing is written when `out`
+/// is null or `max` is not positive. `_regex` is ignored.
+///
+/// NOTE(review): POSIX declares `regerror` as returning `size_t` and taking a
+/// `size_t` buffer size; the signature is kept as-is for existing callers.
+#[no_mangle]
+pub extern "C" fn regerror(code: c_int, _regex: *const regex_t, out: *mut c_char, max: c_int) {
+    let string = match code {
+        0            => "No error\0",
+        REG_NOMATCH  => "No match\0",
+        REG_BADPAT   => "Invalid regexp\0",
+        REG_ECOLLATE => "Unknown collating element\0",
+        REG_ECTYPE   => "Unknown character class name\0",
+        REG_EESCAPE  => "Trailing backslash\0",
+        REG_ESUBREG  => "Invalid back reference\0",
+        REG_EBRACK   => "Missing ']'\0",
+        REG_ENOSYS   => "Unsupported operation\0",
+        REG_EPAREN   => "Missing ')'\0",
+        REG_EBRACE   => "Missing '}'\0",
+        REG_BADBR    => "Invalid contents of {}\0",
+        REG_ERANGE   => "Invalid character range\0",
+        REG_ESPACE   => "Out of memory\0",
+        REG_BADRPT   => "Repetition not preceded by valid expression\0",
+        _ => "Unknown error\0"
+    };
+
+    // No usable buffer: a negative `max` must not turn into a huge length.
+    if out.is_null() || max <= 0 {
+        return;
+    }
+
+    let len = string.len().min(max as usize);
+    unsafe {
+        ptr::copy_nonoverlapping(string.as_ptr(), out as *mut u8, len);
+        // When the message was truncated the copied bytes lack the trailing
+        // NUL, so always terminate at the last written position.
+        *out.offset(len as isize - 1) = 0;
+    }
+}

+ 1 - 0
src/lib.rs

@@ -21,6 +21,7 @@ extern crate cbitset;
 extern crate core_io;
 #[macro_use]
 extern crate lazy_static;
+extern crate posix_regex;
 extern crate rand;
 extern crate va_list;
 

+ 1 - 0
src/platform/types.rs

@@ -46,6 +46,7 @@ pub type wchar_t = i32;
 pub type wint_t = u32;
 pub type wctype_t = i64;
 
+pub type regoff_t = size_t;
 pub type off_t = c_long;
 pub type mode_t = c_int;
 pub type time_t = c_long;

+ 1 - 0
tests/Makefile

@@ -11,6 +11,7 @@ EXPECT_BINS=\
 	locale \
 	math \
 	netdb \
+	regex \
 	select \
 	setjmp \
 	signal \

+ 0 - 0
tests/expected/regex.stderr


+ 3 - 0
tests/expected/regex.stdout

@@ -0,0 +1,3 @@
+Matching group: 25 - 36
+Matching group: 31 - 36
+Matching group: -1 - -1

+ 31 - 0
tests/regex.c

@@ -0,0 +1,31 @@
+#include <regex.h>
+#include <stdio.h>
+
+/* Compiles a basic regex with one group, runs it once and prints the offsets
+ * of the three match slots. Returns -1 if compiling or matching fails. */
+int main() {
+    regex_t regex;
+    char error_buf[256];
+
+    int error = regcomp(&regex, "h.llo \\(w.rld\\)", REG_ICASE);
+    if (error) {
+        regerror(error, &regex, error_buf, 255);
+        error_buf[255] = 0;
+        printf("regcomp error: %d = %s\n", error, error_buf);
+        return -1;
+    }
+
+    regmatch_t matches[3] = { 0 };
+
+    error = regexec(&regex, "Hey, how are you? Hello? Hallo Wurld??", 3, matches, 0);
+
+    if (error) {
+        /* Build the message before the regex is released. */
+        regerror(error, &regex, error_buf, 255);
+        error_buf[255] = 0;
+        regfree(&regex);
+        printf("regexec error: %d = %s\n", error, error_buf);
+        return -1;
+    }
+
+    regfree(&regex);
+
+    for (int group = 0; group < 3; group += 1) {
+        /* regoff_t is not an int; cast so the arguments match %d. Unused
+         * slots are expected to print as -1. */
+        printf("Matching group: %d - %d\n", (int) matches[group].rm_so, (int) matches[group].rm_eo);
+    }
+
+    return 0;
+}