#!/bin/sh
# extract_urls
#
# Shell script to extract all links from a valid HTML document.
#
# Usage: extract_urls URL
#
# The script takes the URL as its only parameter. It runs "lynx -dump" and
# post-processes the output with grep, cut and sed to isolate the links,
# printing one link per line on stdout.
#
# Requirements
#   GNU grep, for the -A option (print the lines following a match).
#
# Constraints
#   The maximum number of URLs per document is arbitrarily set to 5000.
#
# Bugs
#   It is not possible to extract URLs correctly from pages whose rendered
#   text contains a line consisting solely of the word "References".
#
# Exit status
#   1 if the argument count is wrong; otherwise the status of the final
#   grep in the pipeline (non-zero when no line survives the filters).

if [ $# -ne 1 ]; then
  # Diagnostics go to stderr so they never mix with the URL list on stdout.
  echo "Usage: $0 URL" >&2
  exit 1
fi

url=$1

# Pipeline stages:
#   1. Dump the rendered page, leaving hidden links out of the link list.
#   2. Keep the line "References" and up to 5000 lines following it.
#   3. Drop empty lines.
#   4. Drop the "References" heading itself.
#   5. Remove everything up to and including the first "." on each line
#      (presumably the "N." numbering lynx puts in front of every link).
#   6. Strip the leading blanks left over after the cut.
#   7. Discard javascript: pseudo-links.
#   8. Discard lynx-internal image-map entries.
lynx -hiddenlinks=ignore -dump "$url" |
  grep -A 5000 "^References$" |
  grep -v "^$" |
  grep -v "^References$" |
  cut -d"." -f2- |
  sed 's/^ *//' |
  grep -v "^javascript" |
  grep -v "LYNXIMGMAP:"