From d61d1ffc231b5b2b7bc4131f42679097d354f611 Mon Sep 17 00:00:00 2001
From: Louie S
Date: Sat, 11 Mar 2023 21:16:18 -0800
Subject: Use pup for HTML parsing; index the index pages

---
 Makefile     |  10 +++++-
 README       |   1 +
 src/index.sh | 105 ++++++++++++++++++++++++++++++++++++++++-------------------
 3 files changed, 82 insertions(+), 34 deletions(-)

diff --git a/Makefile b/Makefile
index 344a120..397d539 100644
--- a/Makefile
+++ b/Makefile
@@ -50,7 +50,15 @@ $(INFO_PLIST_FILE): src/Info.plist $(CONTENTS_DIR)
 $(INDEX_FILE): src/index.sh $(DOCUMENTS_DIR)
 	rm -f $@
 	src/index.sh $@ $(DOCUMENTS_DIR)/*.html
-	#ruby src/index.rb $(DOCUMENTS_DIR)/*.html | sqlite3 $@
+	src/index.sh -i "Entry" $@ $(DOCUMENTS_DIR)/Concept-Index.html
+	src/index.sh -i "Macro" $@ $(DOCUMENTS_DIR)/M4-Macro-Index.html
+	src/index.sh -i "Macro" $@ $(DOCUMENTS_DIR)/Autoconf-Macro-Index.html
+	src/index.sh -i "Macro" $@ $(DOCUMENTS_DIR)/Autotest-Macro-Index.html
+	src/index.sh -i "Variable" $@ $(DOCUMENTS_DIR)/Cache-Variable-Index.html
+	src/index.sh -i "Variable" $@ $(DOCUMENTS_DIR)/Output-Variable-Index.html
+	src/index.sh -i "Function" $@ $(DOCUMENTS_DIR)/Program-_0026-Function-Index.html
+	src/index.sh -i "Entry" $@ $(DOCUMENTS_DIR)/Preprocessor-Symbol-Index.html
+	src/index.sh -i "Variable" $@ $(DOCUMENTS_DIR)/Environment-Variable-Index.html
 
 $(ICON_FILE): src/icon.png $(DOCSET_DIR)
 	cp src/icon.png $@
diff --git a/README b/README
index 1e64579..09134fb 100644
--- a/README
+++ b/README
@@ -12,4 +12,5 @@ Requirements:
 - any POSIX-compliant shell
 - curl
 - make
+- pup
 - sqlite3
diff --git a/src/index.sh b/src/index.sh
index e81a0f7..8132287 100755
--- a/src/index.sh
+++ b/src/index.sh
@@ -1,25 +1,16 @@
 #!/usr/bin/env sh
 
-DB_PATH="$1"
-shift
+create_table() {
+	sqlite3 "$DB_PATH" "CREATE TABLE IF NOT EXISTS searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);"
+	sqlite3 "$DB_PATH" "CREATE UNIQUE INDEX IF NOT EXISTS anchor ON searchIndex (name, type, path);"
+}
 
 get_title() {
 	FILE="$1"
-
PATTERN=".*\(Autoconf\).*" - - #Find pattern in file - grep -Eo "$PATTERN" "$FILE" | - #Remove tag - sed 's/<[^>]*>//g' | \ - #Remove '(automake)' + pup -p -f "$FILE" 'title text{}' | \ sed 's/(Autoconf)//g' | \ - #Remove trailing space - sed 's/[ ]*$//g' | \ - #Replace '&' with '&' - sed 's/&/&/g' | \ - #Replace '<' with '<' - sed 's/</" "$1" | while read -r line; do + insert_term "$line" + done + + shift + done +} + + +insert_pages() { + # Get title and insert into table for each html file + while [ -n "$1" ]; do + unset PAGE_NAME + unset PAGE_TYPE + PAGE_NAME="$(get_title "$1")" + if [ -n "$PAGE_NAME" ]; then + PAGE_TYPE="$(get_type "$1")" + #get_type "$1" + if [ -z "$PAGE_TYPE" ]; then + PAGE_TYPE="Guide" + fi + #echo "$PAGE_TYPE" + insert "$PAGE_NAME" "$PAGE_TYPE" "$(basename "$1")" fi - #echo "$PAGE_TYPE" - insert "$PAGE_NAME" "$PAGE_TYPE" "$(basename "$1")" - fi - shift + shift + done +} + +insert_term() { + LINK="$1" + NAME="$(echo "$LINK" | pup -p 'a text{}' | sed 's/"/\"\"/g')" + TYPE="$INDEX_TYPE" + PAGE_PATH="$(echo "$LINK" | pup -p 'a attr{href}')" + + insert "$NAME" "$TYPE" "$PAGE_PATH" +} + +TYPE="PAGES" + +# Check flags +while true; do + case "$1" in + -i|--index) + TYPE="INDEX" + shift + INDEX_TYPE="$1" + shift + ;; + *) + break + esac done + +DB_PATH="$1" +shift + +create_table +case "$TYPE" in + PAGES) + insert_pages "$@" + ;; + INDEX) + insert_index_terms "$@" + ;; +esac -- cgit