#!/bin/bash
#                       /usr/local/bin/blogpages
# https://crystalfaeries.net/posix/bin/blogpages
# celeste:crystalfaery BLOGPAGES 2020-08-12 11:39:23+00:00
# create a list of website webpages which are none of:
#	igal2 gallery pages (https://crystalfaeries.net/imgs/)
#	server generated index pages for unindexed directories
# scan the listed pages to find each of:
#	href="	links which are EXTERNAL to our website
#	href="	links which are INTERNAL to our website
#	src="	links which are EXTERNAL to our website
#	src="	links which are INTERNAL to our website
# and write those lists into our 'todo' files $HOME/crystalfaeries.net/src/
help=13	# header length handed to 'head -n' for the help text: this line's number minus one

case $# in

0)
	# No arguments: rebuild the four link 'todo' lists under
	# $HOME/crystalfaeries.net/src/ from the site's own webpages.
	#
	# Every list line has the form "path/to/page.html:matching source line".
	# Classification is per LINE, not per link: a line carrying several links
	# is filed once, as internal if any of its links looks internal.
	# Page paths are handled one per line, so a path containing a newline
	# is not supported.

	# All paths below are relative to $HOME; a failed cd would make us scan
	# and write in whatever directory we were started from, so stop instead.
	cd -P -- "$HOME" || {	# there's no place like Om
		printf '%s: cannot cd to %s\n' "${0##*/}" "$HOME" >&2
		exit 1
	}

	site=crystalfaeries.net
	todo_dir=${site}/src
	if [[ ! -d "$todo_dir" ]]; then
		printf '%s: missing directory %s/%s\n' "${0##*/}" "$HOME" "$todo_dir" >&2
		exit 1
	fi

	# Collect the webpages into an array (no predictable temp files in /tmp,
	# no word-splitting or globbing of the names):
	#   1st find: all *.html except the igal2 templates and the igal2 pages,
	#             which are recognised by a 1-4 character name before .html
	#   2nd find: recover the real webpages under fae/ with such short names
	pages=()
	while IFS= read -r page; do
		pages+=("$page")
	done < <(
		{
			find "${site}/"	-name '*.html'			\
				\!	-name '.indextemplate2.html'	\
				\!	-name '.slidetemplate2.html'	\
				\!	-name '?.html'			\
				\!	-name '??.html'			\
				\!	-name '???.html'		\
				\!	-name '????.html'
			find "${site}/fae/"				\
					-name '?.html'		-o	\
					-name '??.html'		-o	\
					-name '???.html'	-o	\
					-name '????.html'
		} | sort -u
	)

	# A link counts as INTERNAL when its href target mentions our domain or
	# starts with one of our top-level content directories.  The pattern is
	# tied to the href=" attribute on purpose: grep prefixes every hit with
	# "filename:", and each filename starts with "crystalfaeries.net/", so
	# matching the bare domain anywhere on the line would match every line.
	internal_href='href="([^"]*crystalfaeries\.net|/(audio|documents|fae|video)/)'

	# The extra /dev/null operand keeps the "filename:" prefix even when the
	# list holds a single page, and stops grep from waiting on stdin when
	# the list is empty.

	#	href="	links which are EXTERNAL to our website
	grep -- 'href="' "${pages[@]}" /dev/null			\
		| grep -E -v -- "$internal_href"			\
		> "${todo_dir}/.href.external.txt"

	#	href="	links which are INTERNAL to our website,
	#	followed by the site's root-level pages as "/name.html"
	{
		grep -E -- "$internal_href" "${pages[@]}" /dev/null
		for page in "${site}"/*.html; do
			# an unmatched glob stays literal: skip it
			[[ -e "$page" || -L "$page" ]] || continue
			printf '/%s\n' "${page##*/}"
		done
	} > "${todo_dir}/.href.internal.txt"

	#	src="	links which are EXTERNAL to our website
	#	(anything but thumbnails, /documents/ and /imgs/)
	grep -- 'src="' "${pages[@]}" /dev/null				\
		| grep -E -v -- 'src="(.thumb_|/documents/|/imgs/)'	\
		> "${todo_dir}/.images.external.txt"

	#	src="	links which are INTERNAL to our website
	grep -E -- 'src="/(documents|imgs)/' "${pages[@]}" /dev/null	\
		> "${todo_dir}/.images.internal.txt"

	# grep exits 1 when nothing matches; an empty list is not a failure.
	exit	0
	;;

*)
	# Any argument at all: print the header comment of this script.
	head -n "${help}" "$0"	>&2	# laziest help and version info
	exit	0
	;;
esac
exit	255	# didn't we cover all cases and exit?
