summary refs log tree commit diff
diff options
context:
space:
mode:
author Louie S <louie@example.com> 2023-03-11 20:35:06 -0800
committer Louie S <louie@example.com> 2023-03-13 21:31:09 -0700
commit ba2e24c9a7beb20281af7357f18b18aa94bd108c (patch)
tree 28099cf206287f36aa5bf9234b395b02e273e47a
parent 7fb869868f708f28d5351cce0ccfe0d9e183568e (diff)
Improve HTML parsing; index the index entries
-rw-r--r-- .gitignore 4
-rw-r--r-- Makefile 1
-rw-r--r-- README 1
-rwxr-xr-x src/index.sh 92
4 files changed, 71 insertions, 27 deletions
diff --git a/.gitignore b/.gitignore
index 9941ba2..4ba0d40 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
-/GNU_Make.docset
-/GNU_Make.tgz
+/GNU_Bison.docset
+/GNU_Bison.tgz
/tmp
diff --git a/Makefile b/Makefile
index 5a0205e..8a6a2ad 100644
--- a/Makefile
+++ b/Makefile
@@ -50,6 +50,7 @@ $(INFO_PLIST_FILE): src/Info.plist $(CONTENTS_DIR)
$(INDEX_FILE): src/index.sh $(DOCUMENTS_DIR)
rm -f $@
src/index.sh $@ $(DOCUMENTS_DIR)/*.html
+ src/index.sh -i $@ $(DOCUMENTS_DIR)/Index-of-Terms.html
$(ICON_FILE): src/icon.png $(DOCSET_DIR)
cp src/icon.png $@
diff --git a/README b/README
index 6f64279..df3447f 100644
--- a/README
+++ b/README
@@ -12,4 +12,5 @@ Requirements:
- any POSIX-compliant shell
- curl
- make
+- pup
- sqlite3
diff --git a/src/index.sh b/src/index.sh
index 0a2a76c..345c9b3 100755
--- a/src/index.sh
+++ b/src/index.sh
@@ -1,23 +1,16 @@
#!/usr/bin/env sh
-DB_PATH="$1"
-shift
+create_table() {
+ sqlite3 "$DB_PATH" "CREATE TABLE IF NOT EXISTS searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);"
+ sqlite3 "$DB_PATH" "CREATE UNIQUE INDEX IF NOT EXISTS anchor ON searchIndex (name, type, path);"
+}
get_title() {
FILE="$1"
- PATTERN="<title>.*\(Bison.*\).*</title>"
-
- #Find pattern in file
- grep -Eo "$PATTERN" "$FILE" |
- #Remove tag
- sed 's/<[^>]*>//g' | \
- #Remove '(automake)'
+ pup -p -f "$FILE" 'title text{}' | \
sed 's/(Bison.*)//g' | \
- #Remove trailing space
- sed 's/[ ]*$//g' | \
- #Replace '&amp' with '&'
- sed 's/&amp/&/g'
+ sed 's/\"/\"\"/g'
}
insert() {
@@ -28,16 +21,65 @@ insert() {
sqlite3 "$DB_PATH" "INSERT INTO searchIndex(name, type, path) VALUES (\"$NAME\",\"$TYPE\",\"$PAGE_PATH\");"
}
-# Create table
-sqlite3 "$DB_PATH" "CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);"
-sqlite3 "$DB_PATH" "CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);"
-
-# Get title and insert into table for each html file
-while [ -n "$1" ]; do
- unset PAGE_NAME
- PAGE_NAME="$(get_title "$1")"
- if [ -n "$PAGE_NAME" ]; then
- insert "$PAGE_NAME" "Guide" "$(basename "$1")"
- fi
- shift
+insert_term() {
+ LINK="$1"
+ NAME="$(echo "$LINK" | pup -p 'a text{}' | sed 's/"/\"\"/g')"
+ TYPE="Entry"
+ PAGE_PATH="$(echo "$LINK" | pup -p 'a attr{href}')"
+
+ insert "$NAME" "$TYPE" "$PAGE_PATH"
+}
+
+
+insert_index_terms() {
+ # Get each term from an index page and insert
+ while [ -n "$1" ]; do
+ grep -Eo "<a href.*></a>" "$1" | while read -r line; do
+ insert_term "$line"
+ done
+
+ shift
+ done
+}
+
+
+insert_pages() {
+ # Get title and insert into table for each html file
+ while [ -n "$1" ]; do
+ unset PAGE_NAME
+ PAGE_NAME="$(get_title "$1")"
+ if [ -n "$PAGE_NAME" ]; then
+ insert "$PAGE_NAME" "Guide" "$(basename "$1")"
+ fi
+
+
+ shift
+ done
+}
+
+TYPE="PAGES"
+
+# Check flags
+while true; do
+ case "$1" in
+ -i|--index)
+ TYPE="INDEX"
+ shift
+ ;;
+ *)
+ break
+ esac
done
+
+DB_PATH="$1"
+shift
+
+create_table
+case "$TYPE" in
+ PAGES)
+ insert_pages "$@"
+ ;;
+ INDEX)
+ insert_index_terms "$@"
+ ;;
+esac