#!/bin/bash 

# changetrack--Tracks a given URL and, if it's changed since the last
#   visit, emails the new page to the specified address.

# Configuration. The temp file name embeds this shell's PID ($$).
sendmail=$(command -v sendmail)       # Empty if sendmail isn't in PATH
sitearchive="/tmp/changetrack"        # Directory holding page snapshots
tmpchanges="$sitearchive/changes.$$"  # Temp file
fromaddr="webscraper@intuitive.com"   # From: address on notification mail
dirperm=755        # rwx for owner, r-x for group and others
fileperm=644       # rw- for owner, r-- for group and others

# Remove the temp file however the script ends. Single quotes defer the
#   expansion of $tmpchanges until the trap actually fires.
trap 'rm -f -- "$tmpchanges"' EXIT

# A signal handler that simply returns lets the script resume where it was
#   interrupted, so HUP and TERM have to exit explicitly; that exit then
#   triggers the EXIT trap above. 129 and 143 are 128 + the signal number.
trap 'exit 129' HUP
trap 'exit 143' TERM

# Exactly two arguments are required: the URL to track and the address to
#   notify ('-' prints the changes instead of mailing them).
#   $0 is quoted so a script path containing spaces still yields one name.
if [ $# -ne 2 ] ; then
  echo "Usage: $(basename "$0") url email" >&2
  echo "  tip: to have changes displayed on screen, use email addr '-'" >&2
  exit 1
fi

# Create the snapshot directory on first use. The chmod sits inside the -d
#   guard so a directory that already existed keeps whatever mode it has.
if [ ! -d "$sitearchive" ] ; then
  if ! mkdir -- "$sitearchive" ; then
    echo "$(basename "$0") failed: couldn't create $sitearchive." >&2
    exit 1
  fi
  chmod "$dirperm" "$sitearchive"
fi

# Only absolute web URLs are supported. Matching with case avoids passing
#   the URL through an unquoted echo, where '?' and '*' would be treated as
#   glob characters. Anything starting with "http:" is accepted as before,
#   and "https:" URLs are now accepted as well.
case "$1" in
  http:*|https:*)
    ;;
  *)
    echo "Please use fully qualified URLs (e.g. start with 'http://')" >&2
    exit 1
    ;;
esac

# url_to_fname URL
#   Prints the archive file name for URL: every "http://" is removed and
#   each '/', '?' and '&' is turned into '.', giving a name with no slashes.
#   The URL is fed through printf, quoted, so whitespace and glob characters
#   in it reach sed unchanged.
url_to_fname() {
  printf '%s\n' "$1" | sed 's|http://||g' | tr '/?&' '...'
}

# url_to_baseurl URL
#   Prints the first three '/'-separated fields of URL (scheme, the empty
#   field between the two slashes, and the host) followed by a trailing '/'.
url_to_baseurl() {
  printf '%s/\n' "$(printf '%s\n' "$1" | cut -d/ -f1-3)"
}

fname="$(url_to_fname "$1")"
baseurl="$(url_to_baseurl "$1")"

# Grab a copy of the Web page and put it in an archive file. Changes are
#   tracked on the rendered text only (lynx -dump, not -source), so no HTML
#   parsing is needed. uniq collapses runs of identical adjacent lines.

lynx -dump "$1" | uniq > "$sitearchive/${fname}.new"

# $? would only report on uniq, the last stage of the pipeline.
#   PIPESTATUS[0] is lynx's own exit status and has to be read before any
#   other command runs. If lynx reported a failure, discard the snapshot
#   instead of comparing against (or archiving) a bogus page.
if [ "${PIPESTATUS[0]}" -ne 0 ] ; then
  echo "$(basename "$0") failed: couldn't retrieve $1." >&2
  rm -f -- "$sitearchive/${fname}.new"
  exit 1
fi

# Compare the fresh snapshot with the archived one. Every path is quoted
#   because $fname is derived from the user-supplied URL.
archived="$sitearchive/$fname"
latest="$sitearchive/${fname}.new"

if [ ! -f "$archived" ] ; then
  # Nothing to compare against yet: keep this copy as the baseline.
  echo "Status: first visit to $1. Copy archived for future analysis."
  mv -- "$latest" "$archived"
  chmod "$fileperm" "$archived"
  exit 0
fi

# We've seen this site before, so compare the two with diff.
diff "$archived" "$latest" > "$tmpchanges"
diffstatus=$?

if [ -s "$tmpchanges" ] ; then
  # Fall through to the notification step that follows this block.
  echo "Status: Site $1 has changed since our last check."
elif [ "$diffstatus" -gt 1 ] ; then
  # diff exits with a status above 1 when it hit trouble; with no output
  #   that must not be mistaken for "nothing changed".
  echo "$(basename "$0") failed: couldn't compare snapshots of $1." >&2
  rm -f -- "$latest"
  exit 1
else
  echo "Status: No changes for site $1 since last check"
  rm -f -- "$latest"                    # Nothing new...
  exit 0                                # No change--we're outta here.
fi

# If we're here, the site has changed, and we need to send the contents
#   of the page to the user (or show the differences on screen when the
#   address is '-').

# absolutize_links BASEURL
#   Filter: copies HTML from stdin to stdout with BASEURL inserted after
#   every src=" and href=" (matched in any letter case and rewritten in
#   upper case), so relative links still resolve inside a mail message.
#   Links that were already absolute (http: or https:) end up with BASEURL
#   glued in front of them, so the last expression strips it off again.
#   BASEURL is pasted into '|'-delimited sed expressions and therefore must
#   not contain '|' or '&'.
absolutize_links() {
  local base=$1
  sed -e "s|[Ss][Rr][Cc]=\"|SRC=\"${base}|g" \
      -e "s|[Hh][Rr][Ee][Ff]=\"|HREF=\"${base}|g" \
      -e "s|${base}\(https\{0,1\}:\)|\1|g"
}

if [ "$2" != "-" ] ; then

  # Mail the page itself, as HTML. sendmail -t takes the recipient from
  #   the To: header written below.
  ( echo "Content-type: text/html"
    echo "From: $fromaddr (Web Site Change Tracker)"
    echo "Subject: Web Site $1 Has Changed"
    echo "To: $2"
    echo ""

    # Plain GET of the page source. (curl reads "-dump" as "-d ump", which
    #   would POST the string "ump".) -L follows redirects.
    curl -s -L "$1" | absolutize_links "$baseurl"
  ) | "$sendmail" -t

else
  # No address given: print the raw diff between old and new snapshots.

  diff "$sitearchive/$fname" "$sitearchive/${fname}.new"
fi

# Update the saved snapshot of the website so the next run compares against
#   this version. The snapshot is a plain text file, so it gets the same
#   $fileperm mode that is applied when a site is archived for the first
#   time, not an executable mode.

mv -- "$sitearchive/${fname}.new" "$sitearchive/$fname"
chmod "$fileperm" "$sitearchive/$fname"
exit 0
