summary | refs | log | tree | commit | diff
path: root/src/index.sh
diff options
context:
space:
mode:
author: Louie S <louie@example.com> 2023-03-11 20:35:06 -0800
committer: Louie S <louie@example.com> 2023-03-13 21:31:09 -0700
commit: ba2e24c9a7beb20281af7357f18b18aa94bd108c (patch)
tree: 28099cf206287f36aa5bf9234b395b02e273e47a /src/index.sh
parent: 7fb869868f708f28d5351cce0ccfe0d9e183568e (diff)
Improve HTML parsing; index the index entries
Diffstat (limited to 'src/index.sh')
-rwxr-xr-x src/index.sh 92
1 files changed, 67 insertions, 25 deletions
diff --git a/src/index.sh b/src/index.sh
index 0a2a76c..345c9b3 100755
--- a/src/index.sh
+++ b/src/index.sh
@@ -1,23 +1,16 @@
#!/usr/bin/env sh
-DB_PATH="$1"
-shift
+create_table() {
+ sqlite3 "$DB_PATH" "CREATE TABLE IF NOT EXISTS searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);"
+ sqlite3 "$DB_PATH" "CREATE UNIQUE INDEX IF NOT EXISTS anchor ON searchIndex (name, type, path);"
+}
get_title() {
FILE="$1"
- PATTERN="<title>.*\(Bison.*\).*</title>"
-
- #Find pattern in file
- grep -Eo "$PATTERN" "$FILE" |
- #Remove tag
- sed 's/<[^>]*>//g' | \
- #Remove '(automake)'
+ pup -p -f "$FILE" 'title text{}' | \
sed 's/(Bison.*)//g' | \
- #Remove trailing space
- sed 's/[ ]*$//g' | \
- #Replace '&amp' with '&'
- sed 's/&amp/&/g'
+ sed 's/\"/\"\"/g'
}
insert() {
@@ -28,16 +21,65 @@ insert() {
sqlite3 "$DB_PATH" "INSERT INTO searchIndex(name, type, path) VALUES (\"$NAME\",\"$TYPE\",\"$PAGE_PATH\");"
}
-# Create table
-sqlite3 "$DB_PATH" "CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);"
-sqlite3 "$DB_PATH" "CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);"
-
-# Get title and insert into table for each html file
-while [ -n "$1" ]; do
- unset PAGE_NAME
- PAGE_NAME="$(get_title "$1")"
- if [ -n "$PAGE_NAME" ]; then
- insert "$PAGE_NAME" "Guide" "$(basename "$1")"
- fi
- shift
+insert_term() {
+ LINK="$1"
+ NAME="$(echo "$LINK" | pup -p 'a text{}' | sed 's/"/\"\"/g')"
+ TYPE="Entry"
+ PAGE_PATH="$(echo "$LINK" | pup -p 'a attr{href}')"
+
+ insert "$NAME" "$TYPE" "$PAGE_PATH"
+}
+
+
+insert_index_terms() {
+ # Get each term from an index page and insert
+ while [ -n "$1" ]; do
+ grep -Eo "<a href.*></a>" "$1" | while read -r line; do
+ insert_term "$line"
+ done
+
+ shift
+ done
+}
+
+
+insert_pages() {
+ # Get title and insert into table for each html file
+ while [ -n "$1" ]; do
+ unset PAGE_NAME
+ PAGE_NAME="$(get_title "$1")"
+ if [ -n "$PAGE_NAME" ]; then
+ insert "$PAGE_NAME" "Guide" "$(basename "$1")"
+ fi
+
+
+ shift
+ done
+}
+
+TYPE="PAGES"
+
+# Check flags
+while true; do
+ case "$1" in
+ -i|--index)
+ TYPE="INDEX"
+ shift
+ ;;
+ *)
+ break
+ esac
done
+
+DB_PATH="$1"
+shift
+
+create_table
+case "$TYPE" in
+ PAGES)
+ insert_pages "$@"
+ ;;
+ INDEX)
+ insert_index_terms "$@"
+ ;;
+esac