# gproxy.py, written by Rusty Japikse, at Concepts NREC (3/06).
# This code is available as open source under the LGPL license
#
# About:
#   gproxy.py is a python based (duh!) cgi script to provide a secure proxy to a
#   Google Mini server. This script handles securely proxying a connection to
#   different sub-collections on a Google Mini server. Attempts by end users to
#   set the sub-collection variable in get requests
#   (&restrict=secret_sub_collection) are stripped and a default
#   (SUB_COLLECTION Configuration Setting) sub-collection is specified.
#
#   As this script is open source there are no guarantees, if it breaks things,
#   you get to keep the pieces! That being said, this script works quite well
#   with our Google Mini.
#
# Usage:
#   1. Set the configuration variables
#   2. Place this script in a server directory as the default file to be served.
#      Configure security for this directory (e.g. Windows Security Settings for
#      IIS or .htaccess for Apache).
#   3. (Test then...) Deploy!
#
# Other Useful Tips:
#   For feeding your Google Mini with a list of good URLs, please also look for a
#   script called CrawlFS.py. This script should be available in the python
#   cookbook at activestate and at http://www.japikse.com. Hope this helps
#   someone to avoid reinventing the wheel!

################### Configuration Settings ##########################################
GMINI_URL = 'http://gmini/'  # URL to your Google Mini, include the trailing slash
IMG_URL = ''  # url of your personalized image
# custom footer; inserted verbatim, so it may contain whatever HTML you need
FOOTER = """
Intranet | Library Search | Facebook | Timesheets | ACME Corp (External)

Powered by Google

\n"""
INCLUDE_OLD_TITLE = True  # if true, reuse the old title e.g. TITLE - Old Title
TITLE = 'ACME Corportation Fileserver Search'
FAVICON = 'favicon.ico'  # path to your favicon.ico file
SUB_COLLECTION = 'mark_cust'  # the sub-collection to search from
REL_URL = './'  # Might be useful if you aren't serving this as the default page
#####################################################################################

import os
import re
import sys

try:  # Python 3
    from urllib.parse import parse_qs, unquote_plus
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import unquote_plus, urlopen
    from urlparse import parse_qs

# Any tag carrying a src attribute (image urls).
_IMG_RE = re.compile(r'<[^<]*src{1}[^>]*>', re.IGNORECASE | re.DOTALL)

# Footer of the appliance's pages, matched through to the end of the document.
# NOTE(review): the tags that originally wrapped "Powered by Google" in this
# pattern are not recoverable from the surviving source; this version swallows
# whatever run of tags directly precedes the text. Confirm against a real
# Google Mini result page.
_FOOTER_RE = re.compile(
    r'(?:<[^>]+>\s*)*(Powered by Google)+.*|<[^>]+class=footer[^>]*>.*',
    re.DOTALL)

_TITLE_RE = re.compile(r'<title>(.*?)</title>', re.DOTALL | re.IGNORECASE)

# NOTE(review): the exact favicon markup is reconstructed (title followed by
# two link tags that each take FAVICON); adjust if your pages need something else.
_HEAD_TEMPLATE = ('<title>%s %s</title>\n\t'
                  '<link rel="icon" href="%s" type="image/x-icon">\n\t'
                  '<link rel="shortcut icon" href="%s" type="image/x-icon">')


def rewrite_html(h):
    """Rewrite a page fetched from the Google Mini so it points at this script.

    h -- the html text of the fetched page.

    Returns the page with search/basics links redirected to REL_URL, tags that
    have a src attribute replaced by IMG_URL, the footer replaced by FOOTER and
    the title replaced by TITLE (plus the old title when INCLUDE_OLD_TITLE is
    set) followed by favicon links. A page without a title is left without one.
    """
    # functional modifications to the page
    h = h.replace('/search', REL_URL)  # change the search target
    # rewrite where the basics.html link points and turn it into a get method
    h = h.replace('/basics.html', '%s?basics=basics.html' % REL_URL)

    # cosmetic modifications to the page. Callables are used as replacements so
    # that backslashes in the configured or fetched text are inserted literally
    # instead of being read as regex escapes.
    h = _IMG_RE.sub(lambda match: IMG_URL, h)
    # the footer goes in after the src-tag pass so its own markup is kept
    h = _FOOTER_RE.sub(lambda match: FOOTER, h)

    # title and favicon swap
    old_title = ''
    if INCLUDE_OLD_TITLE:
        found = _TITLE_RE.search(h)
        if found is not None:
            old_title = '- ' + found.group(1)
    head = _HEAD_TEMPLATE % (TITLE, old_title, FAVICON, FAVICON)
    return _TITLE_RE.sub(lambda match: head, h)


def gmini_url():
    """Return the url to request from the Google Mini for the incoming request.

    A request carrying a 'basics' parameter maps to the basics page, any other
    request with parameters maps to a search using the cleaned query string,
    and a request without parameters maps to GMINI_URL itself.
    """
    secure_url = clean_url()
    # parse_qs ignores parameters with blank values, so '?q=' counts as no
    # parameters and lands on the front page
    params = parse_qs(os.environ.get('QUERY_STRING', ''))
    if 'basics' in params:  # fetch the basics page
        return GMINI_URL + 'basics.html'
    if params:  # a search is being run
        return GMINI_URL + 'search?' + secure_url
    return GMINI_URL


def clean_url():
    """Return the incoming query string locked to SUB_COLLECTION.

    Every parameter whose name starts with 'restrict', and any 'basics'
    parameter, is removed before 'restrict=SUB_COLLECTION' is appended. This
    should stop attempts by smart / hacker users to query other sub-collections.
    Names are url-decoded and lower-cased before they are compared, so an
    encoded spelling such as '%72estrict' is removed as well. All other
    parameters are passed through byte-for-byte.
    """
    kept = []
    for pair in os.environ.get('QUERY_STRING', '').split('&'):
        name = unquote_plus(pair.partition('=')[0]).strip().lower()
        if not pair or name.startswith('restrict') or name == 'basics':
            continue
        kept.append(pair)
    kept.append('restrict=%s' % SUB_COLLECTION)
    return '&'.join(kept)


def _fetch(url):
    """Return the body served at url as a native str."""
    response = urlopen(url)
    try:
        body = response.read()
    finally:
        response.close()
    if not isinstance(body, str):
        # Python 3 returns bytes. latin-1 maps each byte to exactly one
        # character, so the page survives the round trip through _write_out
        # unchanged whatever charset the server used.
        body = body.decode('latin-1')
    return body


def _write_out(text):
    """Write text to stdout, as raw bytes where the interpreter allows it."""
    binary = getattr(sys.stdout, 'buffer', None)
    if binary is None:  # Python 2: stdout takes byte strings directly
        sys.stdout.write(text)
    else:
        # mirror of the latin-1 decode in _fetch; configured text outside
        # latin-1 degrades to html character references instead of failing
        binary.write(text.encode('latin-1', 'xmlcharrefreplace'))


def main():
    """Serve one CGI request: emit the header, then the rewritten page."""
    _write_out("Content-type: text/html\n\n")
    _write_out(rewrite_html(_fetch(gmini_url())) + "\n")


if __name__ == '__main__':
    main()