diff options
author | Louie S <louie@example.com> | 2023-03-11 20:35:06 -0800 |
---|---|---|
committer | Louie S <louie@example.com> | 2023-03-13 21:31:09 -0700 |
commit | ba2e24c9a7beb20281af7357f18b18aa94bd108c (patch) | |
tree | 28099cf206287f36aa5bf9234b395b02e273e47a /src/index.sh | |
parent | 7fb869868f708f28d5351cce0ccfe0d9e183568e (diff) |
Improve HTML parsing; index the index entries
Diffstat (limited to 'src/index.sh')
-rwxr-xr-x | src/index.sh | 92 |
1 files changed, 67 insertions, 25 deletions
diff --git a/src/index.sh b/src/index.sh index 0a2a76c..345c9b3 100755 --- a/src/index.sh +++ b/src/index.sh @@ -1,23 +1,16 @@ #!/usr/bin/env sh -DB_PATH="$1" -shift +create_table() { + sqlite3 "$DB_PATH" "CREATE TABLE IF NOT EXISTS searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);" + sqlite3 "$DB_PATH" "CREATE UNIQUE INDEX IF NOT EXISTS anchor ON searchIndex (name, type, path);" +} get_title() { FILE="$1" - PATTERN="<title>.*\(Bison.*\).*</title>" - - #Find pattern in file - grep -Eo "$PATTERN" "$FILE" | - #Remove tag - sed 's/<[^>]*>//g' | \ - #Remove '(automake)' + pup -p -f "$FILE" 'title text{}' | \ sed 's/(Bison.*)//g' | \ - #Remove trailing space - sed 's/[ ]*$//g' | \ - #Replace '&amp;' with '&' - sed 's/&amp;/&/g' + sed 's/\"/\"\"/g' } insert() { @@ -28,16 +21,65 @@ insert() { sqlite3 "$DB_PATH" "INSERT INTO searchIndex(name, type, path) VALUES (\"$NAME\",\"$TYPE\",\"$PAGE_PATH\");" } -# Create table -sqlite3 "$DB_PATH" "CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);" -sqlite3 "$DB_PATH" "CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);" - -# Get title and insert into table for each html file -while [ -n "$1" ]; do - unset PAGE_NAME - PAGE_NAME="$(get_title "$1")" - if [ -n "$PAGE_NAME" ]; then - insert "$PAGE_NAME" "Guide" "$(basename "$1")" - fi - shift +insert_term() { + LINK="$1" + NAME="$(echo "$LINK" | pup -p 'a text{}' | sed 's/"/\"\"/g')" + TYPE="Entry" + PAGE_PATH="$(echo "$LINK" | pup -p 'a attr{href}')" + + insert "$NAME" "$TYPE" "$PAGE_PATH" +} + + +insert_index_terms() { + # Get each term from an index page and insert + while [ -n "$1" ]; do + grep -Eo "<a href.*></a>" "$1" | while read -r line; do + insert_term "$line" + done + + shift + done +} + + +insert_pages() { + # Get title and insert into table for each html file + while [ -n "$1" ]; do + unset PAGE_NAME + PAGE_NAME="$(get_title "$1")" + if [ -n "$PAGE_NAME" ]; then + insert "$PAGE_NAME" "Guide" "$(basename "$1")" + fi + + + 
shift + done +} + +TYPE="PAGES" + +# Check flags +while true; do + case "$1" in + -i|--index) + TYPE="INDEX" + shift + ;; + *) + break + esac done + +DB_PATH="$1" +shift + +create_table +case "$TYPE" in + PAGES) + insert_pages "$@" + ;; + INDEX) + insert_index_terms "$@" + ;; +esac |