summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLouie S <louie@example.com>2023-03-11 21:16:18 -0800
committerLouie S <louie@example.com>2023-03-11 21:16:18 -0800
commitd61d1ffc231b5b2b7bc4131f42679097d354f611 (patch)
treebe060ae01b41525165e0bbd8e1b6339f860c5095 /src
parent20aa42ad2c7740d64247e900e02638317b650209 (diff)
Use pup for HTML parsing; index the index pages
Diffstat (limited to 'src')
-rwxr-xr-xsrc/index.sh105
1 files changed, 72 insertions, 33 deletions
diff --git a/src/index.sh b/src/index.sh
index e81a0f7..8132287 100755
--- a/src/index.sh
+++ b/src/index.sh
@@ -1,25 +1,16 @@
#!/usr/bin/env sh
-DB_PATH="$1"
-shift
+create_table() {
+ sqlite3 "$DB_PATH" "CREATE TABLE IF NOT EXISTS searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);"
+ sqlite3 "$DB_PATH" "CREATE UNIQUE INDEX IF NOT EXISTS anchor ON searchIndex (name, type, path);"
+}
get_title() {
FILE="$1"
- PATTERN="<title>.*\(Autoconf\).*</title>"
-
- #Find pattern in file
- grep -Eo "$PATTERN" "$FILE" |
- #Remove tag
- sed 's/<[^>]*>//g' | \
- #Remove '(automake)'
+ pup -p -f "$FILE" 'title text{}' | \
sed 's/(Autoconf)//g' | \
- #Remove trailing space
- sed 's/[ ]*$//g' | \
- #Replace '&amp' with '&'
- sed 's/&amp/&/g' | \
- #Replace '&lt;' with '<'
- sed 's/&lt;/</g'
+ sed 's/\"/\"\"/g'
}
get_type() {
@@ -44,23 +35,71 @@ insert() {
sqlite3 "$DB_PATH" "INSERT INTO searchIndex(name, type, path) VALUES (\"$NAME\",\"$TYPE\",\"$PAGE_PATH\");"
}
-# Create table
-sqlite3 "$DB_PATH" "CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);"
-sqlite3 "$DB_PATH" "CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);"
-
-# Get title and insert into table for each html file
-while [ -n "$1" ]; do
- unset PAGE_NAME
- unset PAGE_TYPE
- PAGE_NAME="$(get_title "$1")"
- if [ -n "$PAGE_NAME" ]; then
- PAGE_TYPE="$(get_type "$1")"
- #get_type "$1"
- if [ -z "$PAGE_TYPE" ]; then
- PAGE_TYPE="Guide"
+insert_index_terms() {
+ # Get each term from an index page and insert
+ while [ -n "$1" ]; do
+ grep -Eo "<a href.*></a>" "$1" | while read -r line; do
+ insert_term "$line"
+ done
+
+ shift
+ done
+}
+
+
+insert_pages() {
+ # Get title and insert into table for each html file
+ while [ -n "$1" ]; do
+ unset PAGE_NAME
+ unset PAGE_TYPE
+ PAGE_NAME="$(get_title "$1")"
+ if [ -n "$PAGE_NAME" ]; then
+ PAGE_TYPE="$(get_type "$1")"
+ #get_type "$1"
+ if [ -z "$PAGE_TYPE" ]; then
+ PAGE_TYPE="Guide"
+ fi
+ #echo "$PAGE_TYPE"
+ insert "$PAGE_NAME" "$PAGE_TYPE" "$(basename "$1")"
fi
- #echo "$PAGE_TYPE"
- insert "$PAGE_NAME" "$PAGE_TYPE" "$(basename "$1")"
- fi
- shift
+ shift
+ done
+}
+
+insert_term() {
+ LINK="$1"
+ NAME="$(echo "$LINK" | pup -p 'a text{}' | sed 's/"/\"\"/g')"
+ TYPE="$INDEX_TYPE"
+ PAGE_PATH="$(echo "$LINK" | pup -p 'a attr{href}')"
+
+ insert "$NAME" "$TYPE" "$PAGE_PATH"
+}
+
+TYPE="PAGES"
+
+# Check flags
+while true; do
+ case "$1" in
+ -i|--index)
+ TYPE="INDEX"
+ shift
+ INDEX_TYPE="$1"
+ shift
+ ;;
+ *)
+ break
+ esac
done
+
+DB_PATH="$1"
+shift
+
+create_table
+case "$TYPE" in
+ PAGES)
+ insert_pages "$@"
+ ;;
+ INDEX)
+ insert_index_terms "$@"
+ ;;
+esac