#gproxy.py, written by Rusty Japikse, at Concepts NREC (3/06).
#This code is available as open source under the LGPL license
#
#About:
# gproxy.py is a Python-based (duh!) CGI script that provides a secure proxy to a Google Mini
# server. The script securely proxies connections to the different sub-collections
# on a Google Mini server. Attempts by end users to set the sub-collection variable in
# GET requests (&restrict=secret_sub_collection) are stripped by regular expressions, and
# a default sub-collection (the SUB_COLLECTION configuration setting) is specified instead.
#
# As this script is open source there are no guarantees; if it breaks things, you get to
# keep the pieces! That being said, this script works quite well in front of our Google Mini.
#
#Usage:
# 1. Set the configuration variables
# 2. Place this script in a server directory as the default file to be served. Configure security for this
# directory (e.g. Windows Security Settings for IIS or .htaccess for Apache).
# 3. (Test then...) Deploy!
#
#Other Useful Tips:
# For feeding your Google Mini with a list of good URLs, please also look for a script
# called CrawlFS.py. This script should be available in the python cookbook at activestate
# and at http://www.japikse.com. Hope this helps someone to avoid reinventing the wheel!
################### Configuration Settings ##########################################
# URL of your Google Mini; include the trailing slash.
GMINI_URL = 'http://gmini/'

# URL of your personalized image. rewrite_html() substitutes this value for
# every tag in the page that carries a "src" attribute.
IMG_URL = ''

# Custom footer that replaces the footer of the proxied pages.
FOOTER = """
\n"""

# If true, reuse the old page title, e.g. "TITLE - Old Title".
INCLUDE_OLD_TITLE = True
TITLE = 'ACME Corportation Fileserver Search'

# Path to your favicon.ico file.
FAVICON = 'favicon.ico'

# The sub-collection every search is restricted to.
SUB_COLLECTION = 'mark_cust'

# URL of this script relative to the served page; might be useful if you
# aren't serving this as the default page.
REL_URL = './'
#####################################################################################
import re, cgi, urllib
def rewrite_html(h):
    """Rewrite a page fetched from the Google Mini so this script can serve it.

    Functional changes: every '/search' and '/basics.html' reference is
    pointed back at this script (REL_URL).  Cosmetic changes: tags carrying
    a src attribute are replaced by IMG_URL, the footer is replaced by
    FOOTER, and the <title> element is replaced by TITLE (optionally followed
    by the old title) plus favicon links.

    h -- the HTML text returned by the Google Mini.
    Returns the rewritten HTML text.
    """
    # --- functional modifications to the page ---
    # Change the search target so queries come back to this script.
    h = re.sub('/search', REL_URL, h)
    # Rewrite where the basics.html link points and turn it into a GET
    # request that gmini_url() recognises via the "basics" parameter.
    h = re.sub('/basics.html', '%s?basics=basics.html' % REL_URL, h)

    # --- cosmetic modifications to the page ---
    # Any tag containing "src" (image urls) is swapped for IMG_URL.
    src_tag = re.compile(r'<[^<]*src{1}[^>]*>', re.IGNORECASE | re.DOTALL)
    h = src_tag.sub(IMG_URL, h)

    # Strip the footer from all pages.
    # NOTE(review): the HTML tag names in this pattern were lost from the
    # published copy of the script (only '\s*(Powered by Google)+.*' and
    # ']+class=footer[^>]*>.*' survived).  <center> and <table> are a
    # reconstruction -- confirm against the markup your Mini actually emits.
    footer = re.compile(
        r'<center>\s*(Powered by Google)+.*</center>'
        r'|<table[^>]+class=footer[^>]*>.*</table>',
        re.DOTALL)
    h = footer.sub(FOOTER, h)

    # Title and favicon swap.  The surviving '[7:-8]' slice in the original
    # matches len('<title>') and len('</title>'), which pins this pattern.
    title_tag = re.compile(r'<title>(.*?)</title>', re.DOTALL)
    old_title = ''
    if INCLUDE_OLD_TITLE:
        found = title_tag.search(h)
        # A page without a <title> simply contributes no old title instead
        # of crashing on None.
        if found is not None:
            old_title = '- ' + found.group(1)
    # NOTE(review): the two favicon <link> elements are reconstructed; only
    # their position and the two FAVICON arguments are known -- confirm the
    # attributes you want.
    head = ('<title>%s %s</title>\n\t'
            '<link rel="icon" href="%s" type="image/x-icon">\n\t'
            '<link rel="shortcut icon" href="%s" type="image/x-icon">'
            % (TITLE, old_title, FAVICON, FAVICON))
    # Use a function replacement so backslashes in the old title (which
    # comes from the fetched page) are inserted literally rather than being
    # parsed as regex escapes / group references.
    return title_tag.sub(lambda _match: head, h)
def gmini_url():
    """Return the URL to fetch from the Google Mini for the current request.

    The choice is based on the incoming CGI request parameters:
      * a "basics" parameter is present -> the Mini's basics.html page;
      * any other parameters are present -> a search, using the query
        string returned by clean_url();
      * no parameters at all             -> GMINI_URL itself.
    """
    # import cgitb; cgitb.enable()  # for debug purposes - keep disabled in production
    secure_url = clean_url()
    # cgi.parse() is the function the deprecated cgi.FormContentDict class
    # wraps; it returns a plain dict mapping field names to lists of values.
    params = cgi.parse()
    if 'basics' in params:  # fetch the basics page
        return GMINI_URL + 'basics.html'
    if params:  # a search is being run
        return GMINI_URL + 'search?' + secure_url
    return GMINI_URL
def clean_url():
    """Return the request's query string pinned to SUB_COLLECTION.

    Every parameter whose name starts with "restrict" (a sub-collection
    request) and every "basics" parameter (the basics.html page request) is
    removed before the query is passed on to the gmini server, and
    '&restrict=SUB_COLLECTION' is appended.  This should stop attempts by
    smart / hacker users to query other sub-collections.

    Parameter names are percent-decoded and lower-cased before they are
    checked, so an encoded spelling such as '%72estrict=secret' is removed
    as well; the parameters that are kept are passed through unmodified.

    Reads QUERY_STRING from the process environment; a missing variable is
    treated as an empty query.
    """
    import os
    try:
        from urllib import unquote  # Python 2, which this script targets
    except ImportError:
        from urllib.parse import unquote  # Python 3

    kept = []
    for pair in os.environ.get('QUERY_STRING', '').split('&'):
        name = unquote(pair.split('=', 1)[0]).lower()
        if name.startswith('restrict') or name == 'basics':
            continue
        kept.append(pair)
    kept.append('restrict=%s' % SUB_COLLECTION)
    # An empty query leaves an empty first element; strip the stray '&'.
    return '&'.join(kept).strip('&')
if __name__ == '__main__':
    # CGI response: the header line, then the blank line that ends the
    # header section (print appends the second newline itself).
    print("Content-type: text/html\n")
    upstream = urllib.urlopen(gmini_url())
    try:
        html = upstream.read()
    finally:
        # Always release the connection to the Mini, even if the read fails.
        upstream.close()
    print(rewrite_html(html))