From 14419f066973d26192bea9b97707bd872af772c7 Mon Sep 17 00:00:00 2001 From: Paul Duncan Date: Thu, 2 Mar 2023 17:09:02 -0500 Subject: initial commit --- .gitignore | 1 + README.md | 16 +++++++ input.txt | 17 +++++++ section-parse-insane.py | 40 ++++++++++++++++ section-parse-insane.rb | 35 ++++++++++++++ section-parse.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++ section-parse.go | 85 ++++++++++++++++++++++++++++++++++ section-parse.py | 62 +++++++++++++++++++++++++ section-parse.rb | 47 +++++++++++++++++++ 9 files changed, 423 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 input.txt create mode 100644 section-parse-insane.py create mode 100644 section-parse-insane.rb create mode 100644 section-parse.c create mode 100644 section-parse.go create mode 100755 section-parse.py create mode 100644 section-parse.rb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..09632a3 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +section-parse diff --git a/README.md b/README.md new file mode 100644 index 0000000..1735fc9 --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ +Section Parsing Examples +======================== + +Section parsing examples for Alonzo in several languages. + +The files in this directory are as follows: + +* `input.txt`: example input file. +* `README.md`: top-level documentation. +* `section-parse.c`: C section parser. See header comment for + build command. +* `section-parse.go`: Go section parser. +* `section-parse.py`: Python section parser. +* `section-parse.rb`: Ruby section parser. +* `section-parse-insane.py`: Python section parser (insane version). +* `section-parse-insane.rb`: Ruby section parser (insane version). 
diff --git a/input.txt b/input.txt new file mode 100644 index 0000000..5a278bc --- /dev/null +++ b/input.txt @@ -0,0 +1,17 @@ +test-text "blah blah blah garbage/foo" +random crap bar.1 +random crap bar.2 +random crap bar.3 +random crap bar.4 + +test-text "blah blah blah garbage/apple" +random crap bar.1 +random crap bar.2 +random crap bar.3 +random crap bar.4 + +test-text "blah blah blah garbage/peach" +random crap bar.1 +random crap bar.2 +random crap bar.3 +random crap bar.4 diff --git a/section-parse-insane.py b/section-parse-insane.py new file mode 100644 index 0000000..e7c6d9e --- /dev/null +++ b/section-parse-insane.py @@ -0,0 +1,40 @@ +#!/usr/bin/python3 + +# +# section-parse-insane.py: scan standard input and do the following: +# +# 1. look for a section that begins with "test-text "STUFF/foo" and ends +# with a blank line. +# 2. print the "bar.DIGITS" suffix for each line in the section +# +# Example: +# > cat input.txt +# test-text "blah blah blah garbage/foo" +# random crap bar.1 +# random crap bar.2 +# random crap bar.3 +# random crap bar.4 +# +# test-text "blah blah blah garbage/apple" +# random crap bar.1 +# random crap bar.2 +# random crap bar.3 +# random crap bar.4 +# > ./section-parse-insane.py < input.txt +# bar.1 +# bar.2 +# bar.3 +# bar.4 +# + +# load libraries +import re +import sys + +# section and row matches +S = re.compile('(?m)^test-text "[^"]+/foo"\n((?:^[^\n]*bar\.\d+\n)+)^\s*\n') +B = re.compile('^.*(bar\.\d+)$') + +# read stdin, extract section body, split by line, map rows to suffix, +# and print to stdout +print("\n".join([B.sub('\\1', s) for s in S.search(sys.stdin.read())[1].strip().split("\n")])) diff --git a/section-parse-insane.rb b/section-parse-insane.rb new file mode 100644 index 0000000..47ee824 --- /dev/null +++ b/section-parse-insane.rb @@ -0,0 +1,35 @@ +#!/usr/bin/env ruby + +# +# section-parse-insane.rb: scan standard input and do the following: +# +# 1. 
look for a section that begins with "test-text "STUFF/foo" and ends +# with a blank line. +# 2. print the "bar.DIGITS" suffix for each line in the section +# +# Example: +# > cat input.txt +# test-text "blah blah blah garbage/foo" +# random crap bar.1 +# random crap bar.2 +# random crap bar.3 +# random crap bar.4 +# +# test-text "blah blah blah garbage/apple" +# random crap bar.1 +# random crap bar.2 +# random crap bar.3 +# random crap bar.4 +# > ./section-parse-insane.rb < input.txt +# bar.1 +# bar.2 +# bar.3 +# bar.4 +# + +# section and row matches +S = %r{^test-text "[^"]+/foo"\n((?:^[^\n]*bar\.\d+\n)+)^\s*\n}m +B = /^.*(bar\.\d+)$/ + +# extract section body, split by line, map to suffix, and print result +puts(ARGF.read.scan(S)[0][0].split(/\n/).map { |s| s.gsub(B, '\1') }) diff --git a/section-parse.c b/section-parse.c new file mode 100644 index 0000000..78fda1f --- /dev/null +++ b/section-parse.c @@ -0,0 +1,120 @@ +// +// section-parse: scan standard input and do the following: +// +// 1. look for a section that begins with "test-text "STUFF/foo" and ends +// with a blank line. +// 2. print the "bar.DIGITS" suffix for each line in the section +// +// Build: +// cc -std=c17 -W -Wall -Wextra -Werror -pedantic -O2 -o section-parse{,.c} +// +// Example: +// > cc -std=c17 -W -Wall -Wextra -Werror -pedantic -O2 -o section-parse{,.c} +// > ./section-parse < input.txt +// bar.1 +// bar.2 +// bar.3 +// bar.4 +// + +#include <stdlib.h> // exit() +#include <stdio.h> // fgets(), feof() +#include <string.h> // memcmp(), strlen() +#include <stdbool.h> // bool + +// section header line prefix +// (used by is_header()) +static const char HEAD_PREFIX[] = "test-text \""; +static const size_t HEAD_PREFIX_LEN = sizeof(HEAD_PREFIX) - 1; + +// section header line suffix +// (used by is_header()) +static const char HEAD_SUFFIX[] = "/foo\"\n"; +static const size_t HEAD_SUFFIX_LEN = sizeof(HEAD_SUFFIX) - 1; + +/** + * Return true if the given line is a section header, and false + * otherwise. 
+ */ +static bool is_header(const char * const s) { + const size_t len = strlen(s); + + // check string length + if (len < (HEAD_PREFIX_LEN + HEAD_SUFFIX_LEN)) { + return false; + } + + // compare line prefix and suffix and return true if they both match + return !memcmp(s, HEAD_PREFIX, HEAD_PREFIX_LEN) && + !memcmp(s + len - HEAD_SUFFIX_LEN, HEAD_SUFFIX, HEAD_SUFFIX_LEN); +} + +/** + * Return true if the given line is a section footer, and false + * otherwise. + */ +static bool is_footer(const char * const s) { + const size_t len = strlen(s); + + for (size_t i = 0; i < len; i++) { + if (s[i] != ' ' && s[i] != '\n') { + // found non-space character, return false + return false; + } + } + + // return true + return true; +} + +/** + * Match ".DIGITS" suffix on given line and print "bar.DIGITS" to standard + * output. + * + * Exits with a non-zero error code if the given line is invalid. + */ +static void print_body_line(const char * const s) { + const size_t len = strlen(s); + + // check line length + if (len < 2) { + fprintf(stderr, "short line\n"); + exit(-1); + } + + // find trailing '.' + // (note: technically this should probably use isdigit() too) + size_t pos = len - 2; + while (pos > 0 && s[pos] != '.') { + pos--; + } + + // check to make sure we found trailing '.' 
+ if (s[pos] != '.') { + fprintf(stderr, "invalid line: %s", s); + exit(-1); + } + + // print trailing digits + // (note: match also includes trailing newline) + printf("bar.%s", s + pos + 1); +} + +int main() { + char buf[1024]; // line buffer + bool in_body = false; // parse state + + // read lines from standard input + while (fgets(buf, sizeof(buf), stdin) && !feof(stdin)) { + if ((!in_body && is_header(buf)) || (in_body && is_footer(buf))) { + // toggle state + in_body = !in_body; + } else if (in_body) { + // print body line + print_body_line(buf); + } + } + + // return success + return 0; +} diff --git a/section-parse.go b/section-parse.go new file mode 100644 index 0000000..83c2ed2 --- /dev/null +++ b/section-parse.go @@ -0,0 +1,85 @@ +// +// section-parse.go: scan standard input and do the following: +// +// 1. look for a section that begins with "test-text "STUFF/foo" and ends +// with a blank line. +// 2. print the "bar.DIGITS" suffix for each line in the section +// +// Build: +// +// go build section-parse.go +// +// Alternatively, you can run this script directly like this: +// +// go run ./section-parse.go < input.txt +// +// Example: +// +// > go run ./section-parse.go < input.txt +// bar.1 +// bar.2 +// bar.3 +// bar.4 +// +package main + +import ( + "fmt" + "io" + "log" + "os" + "regexp" + "strings" +) + +// section header regex +var headerRe = regexp.MustCompile(`^test-text ".*/foo"$`) + +// section footer regex +var footerRe = regexp.MustCompile(`^\s*$`) + +// body line regex +var bodyRe = regexp.MustCompile(`^.*(bar\.\d+)$`) + +// Is the given line a section header? +func isHeader(s string) bool { + return headerRe.MatchString(s) +} + +// Is the given line a section footer? +func isFooter(s string) bool { + return footerRe.MatchString(s) +} + +// Read slice of lines from standard input. 
+func getLines() []string { + // read from standard input + buf, err := io.ReadAll(os.Stdin) + if err != nil { + log.Fatal(err) + } + + // split into lines, return result + return strings.Split(string(buf), "\n") +} + +func main() { + inBody := false + + // walk lines + for _, line := range(getLines()) { + if (!inBody && isHeader(line)) || (inBody && isFooter(line)) { + // toggle state + inBody = !inBody + } else if inBody { + // match digits, check for error + md := bodyRe.FindStringSubmatch(line) + if len(md) < 2 { + log.Fatalf("invalid body line: %s", line) + } + + // print match to standard output + fmt.Println(md[1]) + } + } +} diff --git a/section-parse.py b/section-parse.py new file mode 100755 index 0000000..f2633dc --- /dev/null +++ b/section-parse.py @@ -0,0 +1,62 @@ +#!/usr/bin/python3 + +# +# section-parse.py: scan standard input and do the following: +# +# 1. look for a section that begins with "test-text "STUFF/foo" and ends +# with a blank line. +# 2. print the "bar.DIGITS" suffix for each line in the section +# +# Example: +# +# > cat input.txt +# test-text "blah blah blah garbage/foo" +# random crap bar.1 +# random crap bar.2 +# random crap bar.3 +# random crap bar.4 +# +# test-text "blah blah blah garbage/apple" +# random crap bar.1 +# random crap bar.2 +# random crap bar.3 +# random crap bar.4 +# +# ... (more stuff omitted for brevity) ... +# > ./section-parse.py < input.txt +# bar.1 +# bar.2 +# bar.3 +# bar.4 +# + +# load libraries +import re +import sys + +# regex to match section header line +# (e.g. a line that looks like this: test-text "GARBAGE/foo") +HEAD = re.compile('test-text ".*/foo"') + +# regex to match section footer +# (e.g. a blank line or line containing only spaces) +FOOT = re.compile('^\s*$') + +# regex to match "bar.DIGITS" suffix on body lines +# (used to extract the suffix from each body line) +BODY = re.compile('^.*(bar\.\d+)$') + +# parsing state: are we in the body of the relevant section? 
+in_body = False + +# read lines from standard input +for line in sys.stdin.readlines(): + # strip trailing newline + line = line.strip() + + if (not in_body and HEAD.match(line)) or (in_body and FOOT.match(line)): + # line is section header or footer, toggle state + in_body = not in_body + elif in_body: + # in section body, match and print "bar.DIGITS" suffix + print(BODY.sub('\\1', line)) diff --git a/section-parse.rb b/section-parse.rb new file mode 100644 index 0000000..0d63f38 --- /dev/null +++ b/section-parse.rb @@ -0,0 +1,47 @@ +#!/usr/bin/env ruby + +# +# section-parse.rb: scan standard input and do the following: +# +# 1. look for a section that begins with "test-text "STUFF/foo" and ends +# with a blank line. +# 2. print the "bar.DIGITS" suffix for each line in the section +# +# Example: +# > cat input.txt +# test-text "blah blah blah garbage/foo" +# random crap bar.1 +# random crap bar.2 +# random crap bar.3 +# random crap bar.4 +# +# test-text "blah blah blah garbage/apple" +# random crap bar.1 +# random crap bar.2 +# random crap bar.3 +# random crap bar.4 +# > ./section-parse.rb < input.txt +# bar.1 +# bar.2 +# bar.3 +# bar.4 +# + +# default state +in_body = false + +# read lines +ARGF.readlines.map { |s| s.strip }.each do |s| + # check if either of the following conditions are true: + # + # 1. we are outside the section and the current line is the matching + # section header + # 2. we are in the section body and the current line is empty + if (!in_body && s =~ /^test-text ".*\/foo"$/) || (in_body && s =~ /^\s*$/) + # toggle state + in_body = !in_body + elsif in_body + # print "bar.NUMBERS" line suffix + puts s.gsub(/.*(bar\.\d+)$/, '\1') + end +end -- cgit v1.2.3