#!/usr/bin/regina
/*
 vim:ts=4:noet:wrap:
 $Id:$ 
 * Rick Younie <younie@debian.org>
-+-
USAGE: searchgeo [-l list1 [list2]..] -p pattern
  -b  - download bodies too
  -D  - save the list names to a local file for quicker startup
  -H  - pull up the mail list home page in browser
  -k  - keep the hits file on exit (always kept if Netscape is the browser)
  -l  - lists to search, default devel; all="devel mentors user"
  -L <pat> - print the lists that contain <pat>  '-L "*"' to see all lists
  -p  - search pattern
  -t  - socket routine timeout in seconds, default 60

  ^C breaks off the search and shows the hits to that point if it can

EXAMPLE:
   searchgeo -l devel policy user -p murdock perens jackson -b
-*-
 * 
 * 
 */
	TRACE OFF
	SIGNAL ON HALT NAME BREAK_C		/* ^C before the main loop -> exit 1 */
	SIGNAL OFF ERROR				/* nonzero RC from commands is checked by hand */
	SIGNAL ON FAILURE				/* ..but a command that cannot run is fatal */
	SIGNAL ON NOVALUE				/* uninitialised simple variables are fatal */
	SIGNAL ON SYNTAX

/* -------------------------------------------------------------------
 *	  constants and assigns
 */
	g.			= ''				/* global stem; unset members default to '' */
	g.$lf		= '0a'x				/* LF */
	g.$cr		= '0d'x				/* CR */
	g.$crlf		= '0d 0a'x			/* CRLF */
	g.$tab		= '09'x				/* TAB */
	g.$wipe		= '1b'x || '[K' || g.$cr	/* ANSI erase-to-EOL then CR */
	/* temp file delete checks if browser is Netscape
	 * and it may not be set yet if ^C
	 */
	g.$browser	= ''

	g.$all		= 'devel mentors user'	/* what "-l all" expands to */
	site		= 'www.geocrawler.com'	/* the search host */
	default_browser = '/usr/bin/lynx'	/* fallback when no *BROWSER env var set */

/* -------------------------------------------------------------------
 *
 */
Main:
	parse arg rgs

	/* ----------------------------------------------
	 *	handle environment variables
	 * ---------------------------------------------*/

	/* make name of config file for list names (written by -D) */
	homedir = value('HOME',,'SYSTEM')
	if homedir = '' then call EX 1,'..please set HOME env var'
	localdir = homedir'/.searchscripts'
	localnames = localdir'/geo_names'

	/* set browser: SSBROWSER overrides; else X11BROWSER when DISPLAY
	 * is set, CONSOLEBROWSER otherwise; else the default */
	if value('SSBROWSER',,'SYSTEM') <> ''
		then browser = value('SSBROWSER',,'SYSTEM')
	else do
		if value('DISPLAY',,'SYSTEM') <> ''
			then browser = value('X11BROWSER',,'SYSTEM')
		else browser = value('CONSOLEBROWSER',,'SYSTEM')
	end
	if browser = '' then browser = default_browser
	'type >/dev/null 2>&1' browser	/* it exists after all that? */
	if RC <> 0 then call EX 1,'..you will have to install lynx',
		'or set *BROWSER - please see man page'
	/* Netscape outlives us, so the hits file must survive CLEANUP */
	if pos('netscape', browser) <> 0 then g.$keep = 1

	/* external routines (PULLPAGE etc.) are searched for here */
	call value 'REGINA_MACROS', '/usr/lib/searchscripts','SYSTEM'

	/* ----------------------------------------------
	 *	parse arguments
	 * ---------------------------------------------*/

	/* print lists and exit if no args */
	if rgs = '-h' | rgs = '--help' then signal USAGE
	else if rgs = '-H' then do
		call DOBROWSER 'http://'site
		exit 0
	end
	/* allow missing -p if that's the only switch */
	else if pos(' -',rgs) = 0 then rgs = '-p' rgs

	/* parse given args */
	listpatterns = ''
	rgs = ' 'rgs			/* so every switch is preceded by ' -' */
	opt. = 0
	do while rgs <> ''
		/* peel off one ' -X val..' chunk; '+0' leaves the next ' -' in rgs */
		parse var rgs ' -' option ' -' +0 rgs
		parse var option opt val
		select
			when opt = 'b' then g.$bodiestoo = 1
			when opt = 'D' then downloadnames = 1
			when opt = 'k' then g.$keep = 1
			when translate(opt) = 'L' then do	/* -l (search) or -L (match) */
				if opt = 'L' then opt.$findmatch = 1
				if val = '*' | val = '?' then opt.$justprint = 1
				listpatterns = val
			end
			when opt = 'p' then g.pattern = val
			when opt = 't' then g.$timeout = val
			otherwise call EX 1,'..arg error "-'opt'"'
		end
	end
	if \datatype(g.$timeout,'NUM') then g.$timeout = 60	/* default timeout */

	/* expand the "all" shorthand in place */
    posall = wordpos('all',listpatterns)
    if posall <> 0 then listpatterns = g.$all delword(listpatterns,posall,1)
	lists = listpatterns

	/* save list names -> local file and exit (-D) */
	if value('downloadnames') = '1' then do
		if stream(localdir,'C','QUERY EXISTS') = '' then do
			'mkdir' localdir
			if RC <> 0 then call EX 1,'..could not create' localdir
		end

		'rm 2>/dev/null' localnames
		call GET_LISTNAMES		/* scrape names/urls into all_lists, mlurls. */

		/* write blank-separated "name url" pairs, 50 pairs per line */
		do i = 1 to words(all_lists)
			list = word(all_lists,i)
			call charout localnames, list mlurls.list' '
			if i // 50 = 0 then call lineout localnames,''
		end
		exit
	end

	if opt.$justprint then do	/* -L "*": GET_LISTNAMES prints as it scrapes */
		call GET_LISTNAMES
        exit
    end

	/* load names from local file if exists else pull/scrape the page;
	 * the cache (written by -D) holds blank-separated "name url" pairs */
	if stream(localnames,'C','QUERY EXISTS') <> '' then do
		lists_urls = ''
		do while lines(localnames) <> 0
			lists_urls = lists_urls linein(localnames)
		end
		call stream localnames,'C','CLOSE'

		all_lists = ''
		/* default every mlurls. member to '' so looking up a list that is
		 * not in the cache cannot raise NOVALUE; was "mlurls.list = ''",
		 * which only set the one member whose tail is the literal LIST
		 * (matches the "mlurls. = ''" initialisation in GET_LISTNAMES) */
		mlurls. = ''
		do i = 1 to words(lists_urls) by 2
			list = word(lists_urls,i)
			all_lists = all_lists list
			mlurls.list = word(lists_urls,i+1)
		end
		all_lists = all_lists' '	/* trailing blank for ' name ' searches */
	end
    else call GET_LISTNAMES

	if opt.$findmatch then do		/* -L: report matching names and quit */
		call FIND_MATCH lists
		exit
	end

	g.pattern = strip(g.pattern, 'B')
	if g.pattern = '' then call EX 1,'..no pattern given'

	if g.$bodiestoo <> 1 then g.$bodiestoo = 0
	else do
		/* -b: bodies collect in a second temp file, appended at the end */
		urlindex = 0; sectionnum = 0
		call popen 'tempfile -p geo__ -d /tmp -s .html 2>/dev/null', 'tfile.'
		if tfile.0 <> 1 then call EX 1,'..could not create tempfile in /tmp'
		bodyfile = tfile.1
	end

	if lists = '' then listnames = 'debian-devel'	  /* default */
	else do
		/* map short names to the full "debian-xxx" form; reject unknowns */
		listnames = ''
		do i = 1 to words(lists)
			list = word(lists, i)
			if pos(' debian-'list' ',all_lists) <> 0 then list = 'debian-'list
			else if pos(' 'list' ',all_lists) = 0 then call EX 1,'..no list "'list'"'
			listnames = listnames list
		end i
	end

	/* fix non-alphanumeric chars in the search pattern: url-encode as %XX */
	transpat = ''
	do i = 1 to length(g.pattern)
		char = substr(g.pattern, i, 1)
		select
			when char >= 'a' & char <= 'z' then nop
			when char >= 'A' & char <= 'Z' then nop
			when char >= '0' & char <= '9' then nop
			otherwise char = '%'c2x(char)
		end
		transpat = transpat || char
	end i

	/* tempfile; section urls if -b, else all hits */
	call popen 'tempfile -p geo__ -d /tmp -s .html 2>/dev/null', 'tfile.'
	if tfile.0 <> 1 then call EX 1,'..could not create tempfile in /tmp'
	savefile = tfile.1
	call lineout savefile,,
		'<html>'g.$lf'<head>'g.$lf'<title>Geocrawler Search Results</title>'
	call lineout savefile, '</head>'g.$lf ||,
	  '<body background="" bgcolor="#FFFFFF" text="#000000">'

	/* ----------------------------------------------
	 *	main
	 * ---------------------------------------------*/

	got_hits = 0				/* no browser display if no hits */
	SIGNAL OFF HALT				/* necessary? */
	SIGNAL ON HALT NAME LEAVEJ	/* catch ^C	*/
	do j = 1 to words(listnames)

		listname = word(listnames,j)
		/* the url's leading number is the cgi's config= value */
		parse var mlurls.listname listnum '/' .

		call charout ,listname': '
		call lineout savefile, '<pre>'g.$lf||g.$lf'<b>  'listname'  </b>'g.$lf

		request = '/search/?config='listnum'&words='transpat

		/* get the first page */
		hits = 0
		msgnumbers.0 = 0
		tagnumbers.0 = 0
		pull_errno = GETPAGE(site,request,g.$timeout,,1)
		call charout ,g.$cr || g.$wipe		 /* lose the dots */
		if pull_errno <> 0 then leave j
 
		/* first pull has the number of pages of hits (10/page) */
		if pos('<FONT FACE="ARIAL,HELVETICA">There were no matches',,
			sockbuff) <> 0 then do
			say listname':   ..no hits       '
			call lineout savefile, '   ..no hits'
			iterate j
		end
		else if pos('<H1>Error', sockbuff) <> 0 then do
			parse var sockbuff '<H1>' error '</H1>'
			say listname': 'error
			call lineout savefile, error
			iterate j
		end

		parse var sockbuff 'Your search for <B>' .,
		  '</B> returned <B>' hits '</B>'
		/* hits='' if geocrawler doesn't return anything (is down f.e.) */
		if \datatype(hits,'NUM') then hits = 0
		if datatype(value('hits'),'NUM') then if hits > 0 then got_hits = 1

		call PROCESS_PAGE

		/* now do the rest of the pages: ceiling(hits/10) of them in all */
		do pages = 2 to hits % 10 + ((hits//10) <> 0)

			pull_errno = GETPAGE(site,request'&page='pages,g.$timeout)
			if pull_errno <> 0 then leave pages

			call PROCESS_PAGE

		end pages

		/* salvage hits so far even if ^C */
		if \g.$bodiestoo then do
			say
			sortstem = 'msgnumbers.'	/* SORT orders by message number */
			call SORT

			do i = 1 to msgnumbers.0
				msgno = msgnumbers.i
				call charout savefile, a.msgno
			end
		end
		/* if url scraping got a ^C, just leave */
		else if value('pull_errno') = 0 then do
			sortstem = 'tagnumbers.'	/* SORT orders the scraped url tags */
			call SORT
			body_errno = PULLBODIES()
		end

		if value('pull_errno') = 1 | value('body_errno') = 1 then leave j

	end j

LEAVEJ:
	say

	/* savefile is an unset (LITeral) symbol if ^C came before it was made */
	if symbol('savefile') <> 'LIT' then do
		call stream savefile, 'C', 'CLOSE' /* flush */

		/* if bodiestoo, savefile=indexes, bodyfile=bodies */
		if symbol('bodyfile') <> 'LIT' then do
			call stream bodyfile,'C','CLOSE'
			'cat' bodyfile '>>' savefile
		end
		call lineout savefile, '</body></html>'
		call stream savefile, 'C', 'CLOSE'
		
		if got_hits then call DOBROWSER 'file://'savefile
	end

	call CLEANUP

	exit 0


/* -------------------------------------------------------------------
 * -L: for each pattern, print every list name in all_lists containing
 * it.  A '*' at the start/end of a pattern anchors the match to the
 * end/start of a name; '*' at both ends (or none) matches anywhere.
 */
FIND_MATCH: PROCEDURE EXPOSE g. all_lists

	parse arg pats

	do n = 1 to words(pats)
		needle = word(pats,n)
		call charout ,needle': '
		if pos('*',needle) <> 0 then do
			/* note the anchors before stripping the stars off */
			starfirst = (left(needle,1) = '*')
			starlast = (right(needle,1) = '*')
			needle = strip(needle,'B','*')
			select
				when starfirst & starlast then nop	/* match anywhere */
				when starfirst then needle = needle' '	/* anchor at name end */
				otherwise needle = ' 'needle		/* anchor at name start */
			end
		end
		hitpos = pos(needle,all_lists)
		if hitpos = 0 then  call charout ,'..no match'
		else do while hitpos <> 0
			/* widen the hit to the whole blank-delimited list name */
			wstart = lastpos(' ', all_lists, hitpos)
			wend = pos(' ', all_lists, hitpos + 1)
			call charout ,strip(substr(all_lists,wstart,wend-wstart),'B')'  '
			hitpos = pos(needle, all_lists, wend)
		end
		say
	end n

	return

/* -------------------------------------------------------------------
 * scrape the geocrawler list-index pages
 * RETURNS (via EXPOSE):
 * mlurls.(listname) == the relative url for listname 
 * all_lists == all mailing list names (passing on SourceForge lists)
 * prints each section/list as it goes when opt.$justprint (-L "*")
 */
GET_LISTNAMES: PROCEDURE EXPOSE g. opt. site all_lists mlurls.

	url = '/lists/3/'		/* the top list-index page */

	all_lists = ''
	mlurls. = ''			/* default all members to '' */
	ret = GETPAGE(site,url,g.$timeout)
    if ret = 0 then do
		url_mark = '<A HREF="'
		/* the section links live in the table after ">Mailing List" */
		parse var sockbuff '>Mailing List' '<table' sectbuff '</table>'
		do while pos(url_mark,sectbuff) <> 0
			parse var sectbuff (url_mark) section '">' sectbuff
			section = strip(section,'B')
			if section = 'SourceForge' then iterate		/* skip SF's own lists */

            if opt.$justprint then say g.$lf'  'strip(section,'B')||g.$lf

			bodyurl = url||section

			/* each section page carries the name/url pair per list */
			ret = GETPAGE(site,bodyurl,g.$timeout)
			parse var sockbuff '<H3>' bodybuff '</table>'
			do while pos(url_mark,bodybuff) <> 0
				parse var bodybuff (url_mark) listurl '">' '&nbsp;' listname '</A>' bodybuff
				mlurls.listname = strip(listurl,'B')
				
                listname = strip(listname,'B')
                if opt.$justprint then call charout ,listname'  '
                all_lists = all_lists listname
            end
            if opt.$justprint then say
        end
        all_lists = all_lists' '	/* trailing blank for ' name ' searches */
    end

	return

/* -------------------------------------------------------------------
 * pull all urls.i pages -> savefile; bodies -> bodyfile
 * RETURNS: 1 on socket error or ^C (via CTRL_2), else 0
 */
PULLBODIES: PROCEDURE EXPOSE g. savefile bodyfile urls. listname,
	sectionnum tagnumbers.

	SIGNAL ON HALT NAME CTRL_2	/* ^C here -> return 1, caller bails out */
	errnum = 0
	do i = 1 to tagnumbers.0

		tag = tagnumbers.i
		/* progress: "listname:      i total" on one rewritten line */
		call charout ,g.$cr || listname':' right(i,6) tagnumbers.0 || g.$wipe

		/* the url may be absolute; split the host off ('+0' keeps the '/') */
		parse var urls.tag 'http://' site '/' +0 url
		errnum = GETPAGE(site,url,g.$timeout)
		if errnum <> 0 then leave i

		parse var sockbuff 'Thread:' '<A HREF="' ttag '</H3>' . '<PRE>',
		  msg '</PRE>' .
		threadline = 'Thread: <A HREF="http://www.geocrawler.com'ttag

		/* section urls: one index line per message in savefile, linking
		 * to the anchor written with the body below */
		parse var msg 'FROM:' from '<' 'DATE:' date '&' 'SUBJECT:',
		  subject '<P>'
		subject = strip(subject,'B')
		sectionnum = sectionnum + 1
		out = right(date,10)  '<a href="#SECTION'sectionnum'">'||,
		  left(subject,45)'</A>'
		if length(from) > 22 then out = out left(from,22)
		else out = out from
		call lineout savefile, out
		if i // 10 = 0 then call lineout savefile,''	/* blank line per 10 */

		/* bodies */
		msg = translate(msg,g.$lf,g.$cr)	/* CRs -> LFs */
		msg = strip(msg,'T',g.$tab)
		msg = strip(msg,'T',g.$lf)
		call lineout bodyfile, '<pre>'g.$lf||g.$lf||g.$lf||,
			'<hr><A NAME="SECTION'sectionnum'"></A><hr>'g.$lf||threadline
		call lineout bodyfile, msg

	end
	say
	tagnumbers.0 = 0	/* consumed; reset for the next list */

	return errnum <> 0
CTRL_2:
	return 1

/* -------------------------------------------------------------------
 * fetch one web page with the PULLPAGE external socket function
 * In:  site, url, timeout, header, dots (last two optional, passed on)
 * Out: page text in the exposed sockbuff
 * RETURNS: 0 = ok, 1 = any error (or ^C via CTRL_3)
 */
GETPAGE: PROCEDURE EXPOSE g. sockbuff

	SIGNAL ON HALT NAME CTRL_3	/* ^C while pulling -> report failure */
	parse arg site,url,timeout,header,dots

	/* PULLPAGE answers "errnum,pagetext" */
	reply = PULLPAGE(site, url, timeout, header, dots)
	parse var reply errnum ',' sockbuff

	if pos('<H1>Geocrawler is down for nightly database '||,
	  'maintenance</H1>', sockbuff) <> 0
	then call EX 1,'  ..down for nightly maintenance - try again in 15 minutes'

	select
		when errnum = 0 then nop
		when errnum = -3 then say '  ..no rxsock function library'
		when errnum = -2 then say '  ..timeout'
		when errnum > 0 then say '  ..socket error:' errnum
		otherwise nop
	end

	return errnum <> 0
CTRL_3:
	return 1

/* -------------------------------------------------------------------
 * handle one page of results: without -b the hits themselves are
 * parsed out, with -b only the message urls are scraped for later;
 * either way print a progress line (current count / total hits)
 */
PROCESS_PAGE:

	if \g.$bodiestoo then do
		call PARSEPAGE sockbuff
		currentmark = msgnumbers.0
	end
	else do
		call SCRAPE_URLS sockbuff
		currentmark = tagnumbers.0
	end

	call charout , g.$cr || listname':' right(currentmark,6) hits || g.$wipe

	return

/* -------------------------------------------------------------------
 * just scrape the msg urls from the page; we pull the bodies later (-b)
 * RETURNS:
 *  tagnumbers.i   (i from 1 -> tagnumbers.0)
 *		- zero-padded tags built from the message number found in the
 *		  url, so SORT can order them
 *  urls.j   (j = tagnumbers.i)
 *		- the scraped url, indexed by its tag
 */
SCRAPE_URLS: PROCEDURE EXPOSE g. urls. urlindex tagnumbers. listname hits

	parse arg sockbuff

	do forever
		parse var sockbuff '<i><a href="' url '">' sockbuff
		if url = '' then leave		/* no more hit links on this page */

		/* split the url on '/'; the message number sits in pieces 9+ */
		parts = subword(translate(url,' ','/'),6)
		parse var parts . . . msgno
		sorttag = right('000000000'msgno,9)	/* fixed-width sort key */

		k = tagnumbers.0 + 1
		tagnumbers.0 = k
		tagnumbers.k = sorttag
		urls.sorttag = url
	end

	return

/* -------------------------------------------------------------------
 * breaks a page up into separate posts (10 on a full page) and
 * adds them to the msgnumbers. array (for ! -b)
 * RETURNS (via EXPOSE):
 *  msgnumbers.i (1 -> msgnumbers.0) - message numbers, used as sort keys
 *  a.msgno - the cleaned html chunk for that post
 */
PARSEPAGE: PROCEDURE EXPOSE g. a. msgnumbers.

	parse arg sockbuff

	startmark = '<dl><dt><strong><a href'	/* a post starts here.. */
	endmark = '</dd></dl>'					/* ..and ends here */

	/* individual posts -> msgnumbers. stem variable */
	i = msgnumbers.0			/* append after any earlier pages */
	do forever

		/* '+0' keeps startmark itself at the front of buff */
		parse var sockbuff (startmark) +0 buff (endmark) sockbuff
		parse var buff '/' chunk '/</a></i>'
		msgno = substr(chunk, lastpos('/',chunk)+1)
		if \datatype(msgno,'NUM') then leave	/* no more posts */

		/* drop the '<img' decoration from the post's first line */
		parse var buff pre '</strong><img' . (g.$lf) post
		buff = pre || g.$lf || post

		i = i + 1
		msgnumbers.i = msgno
		a.msgno = buff || endmark

	end
	msgnumbers.0 = i

	return

/* -------------------------------------------------------------------
 * show url in the chosen browser; a running Netscape is driven through
 * its -remote interface instead of starting a second instance
 */
DOBROWSER: PROCEDURE EXPOSE g. browser

	parse arg url

	if pos('netscape', browser) = 0 then do
		browser url			/* non-Netscape: run in the foreground */
		return
	end

	/* Netscape: the lock symlink exists while an instance is running */
	nslock = value('HOME',,'SYSTEM')'/.netscape/lock'
	'[ -h' nslock ']'
	if RC = 0 then browser '-remote "openURL('url')" 2>/dev/null'
	else do
		browser url '2>/dev/null&'	/* start fresh, in the background */
		call sleep 5				/* give it time to read the file */
	end

	return

/* -------------------------------------------------------------------
 * shellsort, descending, of the stem array whose name is in the
 * global "sortstem" (e.g. 'msgnumbers.'); element 0 holds the count.
 * VALUE()/INTERPRET are needed because the stem name is indirect.
 * RETURNS:
 *	the sorted stem array
 */
SORT: PROCEDURE EXPOSE g. (sortstem)

	/* pick the largest gap from the 1, 4, 13, 40.. sequence */
	m = 1
	do while (9 * m + 4) < value(sortstem'0')
		m = m * 3 + 1
	end
	do while m > 0
		k = value(sortstem'0') - m
		do j = 1 to k
			q = j
			do while q > 0
				l = q + m
				/* elements q and q+m already in order - stop sifting */
				if value(sortstem'q') >= value(sortstem'l') then leave
				tmp = value(sortstem'q')
				interpret sortstem'q =' sortstem'l'
				interpret sortstem'l = tmp'
				q = q - m
			end
		end
		m = m % 3	/* shrink the gap */
	end

	return

/* -------------------------------------------------------------------
 * error trapping, usage
 * prints the lines of this source file between the '-+-' and '-*-'
 * markers in the header comment
 */
USAGE: 

	do i = 3 to 10			/* the '-+-' marker sits near the top */
		line = sourceline(i)
		if left(line,3) = '-+-' then do j = i + 1 for 25
			line = sourceline(j)
			if left(line,3) = '-*-' then leave i
			if left(line,1) <> '!' then say line
			else interpret substr(line,2)	/* '!' lines are executed, not shown */
		end j
	end i
	return 0

NOVALUE:
FAILURE:
SYNTAX:
ERROR:
	/* report the trapped condition with its source line, then fall
	 * through to BREAK_C's exit after cleaning up the temp files */
	err = condition('C') 'error, line' SIGL': "'condition('D')'"'
	call lineout 'stderr', err
	call CLEANUP
BREAK_C:
	exit 1
/* EX(ret,err): print err to stderr, clean up, exit with ret */
EX:
	parse arg ret,err
	call lineout 'stderr', err
	call CLEANUP
	exit ret
/* remove temp files; the hits file stays when -k (or Netscape) set g.$keep */
CLEANUP:
	if symbol('bodyfile') <> 'LIT' then 'rm 2>/dev/null' bodyfile
	if value('g.$keep') <> '1' then if symbol('savefile') <> 'LIT'
		then 'rm 2>/dev/null' savefile
	return
