#!/usr/bin/env python
"""
Last edited: Tue Jan 19, 2010 10:12am

This script queries Craigslist job postings in the US using QUERY_STR,
stores postings it has not seen before in a local buzhug table and emails
a digest of the records that have not been reported yet.

Dependencies: mechanize, BeautifulSoup, buzhug, html2text
"""
import mechanize, os, re, string, smtplib
from datetime import datetime, date, timedelta
from BeautifulSoup import BeautifulSoup
from buzhug import Base
from time import sleep
from html2text import html2text
from email.MIMEText import MIMEText
import time

# set the global vars
ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
QUERY_STR = 'structural -"mechanical engineer" -estimator -drafter -draftsman -SoCore -PT&C -www.robsonforensic.com'
DIGEST_SIZE = 40 # number of words to fetch and store for each post
MAIL_FROM = "my_email@gmail.com"
MAIL_TO = "my_email@gmail.com"
MAIL_PASS = "my_password"
SMTP_HOST = "smtp.gmail.com"
SMTP_PORT = 587
EXPIRE = 31 # number of days after which the ad is deleted from the table
CRAIG_ROOT ="http://geo.craigslist.org/iso/us"

# The digest text is explicitly encoded as UTF-8 in composeMesg, so the
# message must declare that same charset.
MAIL_CHARSET = 'utf-8'

# Patterns are compiled once here instead of once per fetched posting.
POST_URL_RE = re.compile(r'(/egr/([0-9]+)\.html)')
LOCATION_RE = re.compile(r'Location:([ a-zA-Z,.]+)')
DATE_RE = re.compile(r'Date:([ 0-9,:A-Z-]+)')
# %I (12-hour clock) is required for %p to be honoured; with %H the AM/PM
# marker is parsed but ignored and every PM posting lands 12 hours early.
DATE_FORMAT = "%Y-%m-%d, %I:%M%p"


def prepareDB(logfile):
    """Prepares database tables

    Opens the buzhug base stored in the 'db/ads' directory below the
    current working directory, creating it on first use.

    logfile -- open file-like object that receives progress messages
    Returns the opened buzhug Base instance.
    """
    # Resolve the path once instead of chdir()-ing into the directory, so
    # the caller's working directory is left untouched.
    db_dir = os.path.abspath('db')
    if not os.path.exists(db_dir):
        os.makedirs(db_dir)
    adsdb = Base(os.path.join(db_dir, 'ads'))
    try:
        adsdb.open()
        logfile.write('\nOpening existing database...')
    except IOError:
        # open() failed, so create the table from scratch
        adsdb.create(('PostingID', str),
                     ('dt', datetime),
                     ('title', str),
                     ('location', str),
                     ('url', unicode),
                     ('body', unicode),
                     ('notified', bool)  # True if record emailed
                     )
        logfile.write('\nOpening new database...')
    return adsdb


def setBrowser():
    """Creates a browser instance

    Returns a mechanize.Browser that ignores robots.txt and sends a
    desktop Firefox user-agent header.
    """
    br = mechanize.Browser()
    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # create user-agent
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.10) Gecko/2009042523 Ubuntu/9.04 (jaunty) Firefox/3.0.10')]
    return br


def currentSiteList(br, logfile):
    """Returns a list of Craigslist sites within the US

    br      -- mechanize browser used to fetch CRAIG_ROOT
    logfile -- open file-like object that receives progress messages
    Returns a (links, names) pair of equally long lists: the href of every
    anchor inside the element with id "list", and its display name.
    """
    # open the root of craigslist sites
    root = br.open(CRAIG_ROOT)
    # get a list of site links under the element with id="list"
    root_soup = BeautifulSoup(root.read())
    divblock = root_soup.findAll(id="list")
    anchors = divblock[0].findAll('a')
    # get default location names
    locs = []
    for a in anchors:
        try:
            # the name may be wrapped in a child tag of the anchor
            locs.append(a.findChildren()[0].string.strip())
        except (IndexError, AttributeError):
            # no child tag (or a child without text): use the anchor text
            locs.append(a.string.strip())
    logfile.write('\nFound %d sites nationwide' % len(locs))
    return [a["href"] for a in anchors], locs


def executeSearch(br, root):
    """Fills out Craigslist's form

    br   -- mechanize browser
    root -- URL of one local Craigslist site
    Returns the list of mechanize links on the result page whose URL looks
    like an arch / engineering posting.
    """
    br.open(root)
    br.open(br.click_link(text="arch / engineering"))
    # select the first form on the page
    try:
        br.select_form(nr=0)
    except mechanize.FormNotFoundError:
        # if the form is not found, follow the link to the form
        br.open(br.click_link(text="Continue to arch / engineering job postings"))
        br.select_form(nr=0)
    # execute search
    br.form['catAbbreviation'] = ['egr']
    br.form['query'] = QUERY_STR
    br.submit()
    return list(br.links(url_regex=POST_URL_RE))


def _find_location(soup, default):
    """Returns the 'Location:' value of the first <li> carrying one, else default"""
    for tag in soup.findAll('li'):
        places = LOCATION_RE.findall(str(tag))
        if places:
            # The captured group cannot contain ':', so it already is the
            # bare location text.
            return places[0]
    return default


def _parse_post_date(html):
    """Returns the posting datetime found after 'Date:' in html, or None"""
    found = DATE_RE.findall(html)
    if not found:
        return None
    # keep only the "<date>, <time>" tokens and drop the trailing time zone
    stamp = ' '.join(found[0].strip().split()[:2])
    return datetime.strptime(stamp, DATE_FORMAT)


def insertRecs(local_links, adsdb, br, lo, logfile):
    """Returns a list of record ids that were inserted on this run

    local_links -- mechanize links to individual postings
    adsdb       -- opened buzhug base, as returned by prepareDB
    br          -- mechanize browser
    lo          -- default location name, used when the post has none
    logfile     -- open file-like object that receives error messages
    """
    inserted = []
    for link in local_links:
        # extract the PostingID from the url and skip postings already
        # stored (to reduce the load on Craigslist)
        postid = link.url.split('/')[-1].split('.')[0]
        if adsdb.select(None, PostingID=postid):
            continue
        title = link.text
        try:
            # follow the link
            br.follow_link(link)
            response = br.response()
            html = response.read()
            post_url = response.geturl()
            soup = BeautifulSoup(html)

            post_location = _find_location(soup, str(lo))

            posted = _parse_post_date(html)
            if posted is None:
                # a recently expired post could be lurking here; the 'dt'
                # field is declared as datetime, so store a datetime
                # rather than a date
                posted = datetime.now()

            # fetch the first DIGEST_SIZE words of the post body
            userbody = soup.findAll(id='userbody')
            if not userbody:
                # if no body, don't bother processing
                continue
            digest = unicode(userbody[0])
            digest = ' '.join(digest.split(' ')[:DIGEST_SIZE])
            digest = html2text(digest)

            # see if there's an exact same record in db (a repost?)
            if not adsdb.select(None, location=post_location, body=digest):
                inserted.append(adsdb.insert(PostingID=postid,
                                             dt=posted,
                                             title=title,
                                             location=post_location,
                                             url=post_url,
                                             body=digest,
                                             notified=False))
        except Exception as err:
            # Best effort: one bad posting must not stop the crawl, but
            # Ctrl-C / SystemExit are no longer swallowed.
            logfile.write("\nCouldn't follow this link: %s (%r)" % (str(link), err))
            continue
    return inserted


def composeMesg(adsdb, logfile):
    """Emails new posts

    Builds one plain-text digest of every record with notified=False,
    sends it through SMTP_HOST and marks those records as notified.

    adsdb   -- opened buzhug base
    logfile -- open file-like object that receives progress messages
    Returns True if the mail was sent and the records were updated,
    False otherwise.
    """
    recs = adsdb.select_for_update(['dt', 'title', 'location', 'url', 'body', 'notified'], notified=False)
    logfile.write("\nThere's a total of %d records to email" % len(recs))
    parts = []
    for rec in recs:
        parts.append("\n%s \nDate: %s \nTitle: %s (%s) \nDigest: %s" % (
            rec.url.encode('utf-8'),
            rec.dt.strftime("%A, %d %B %Y, %I:%M%p"),
            rec.title,
            rec.location,
            rec.body.encode('utf-8')))
        parts.append("\n\n----------------------\n")
    mesg = ''.join(parts)
    try:
        # now try to send an email
        msg = MIMEText(_text=mesg, _charset=MAIL_CHARSET)
        msg['subject'] = "Craigslist updates"
        sender = 'craigslistJobs.py <%s>' % (MAIL_FROM)
        msg['from'] = sender
        msg['to'] = MAIL_TO
        s = smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        try:
            s.ehlo()
            s.starttls()
            s.ehlo()
            s.login(MAIL_FROM, MAIL_PASS)
            s.sendmail(sender, MAIL_TO, msg.as_string())
        finally:
            # release the connection even when login/sendmail fails
            s.close()
        # if success, set notified=True
        adsdb.update(recs, notified=True)
        return True
    except Exception as err:
        # couldn't send the email; keep the reason for the log reader
        logfile.write('\nEmail error: %r' % (err,))
        return False


def cleandb(adsdb, logfile):
    """Deletes old records

    Removes records older than EXPIRE days that were already emailed,
    then closes the base.

    adsdb   -- opened buzhug base (closed on return)
    logfile -- open file-like object that receives progress messages
    """
    # The 'dt' field is declared as datetime, so compare it against a
    # datetime (midnight, EXPIRE days ago) rather than a date.
    midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    lastday = midnight - timedelta(days=EXPIRE)
    try:
        # fetch old records that have been included in emailed reports
        recs = adsdb.select_for_update(None, 'dt < stamp and notified', stamp=lastday)
        logfile.write("\nnumber of records to be deleted: %d" % len(recs))
        adsdb.delete(recs)
        adsdb.cleanup()
    except Exception:
        logfile.write('\nFailed to delete old files')
    adsdb.close()
    return


def apprun():
    """Main function

    Crawls every US Craigslist site, stores new postings, emails the
    digest, purges expired records and writes a run log to
    'searchCraigslistJobs.log' in ROOT_DIR.
    """
    # set the timer
    start = time.time()
    os.chdir(ROOT_DIR)
    # open log file
    logtime = datetime.now().strftime("%A, %d %B %Y, %I:%M%p")
    logfile = open('searchCraigslistJobs.log', 'w')
    try:
        logfile.write(logtime + '\n\n\n')
        adsdb = prepareDB(logfile)
        br = setBrowser()
        links, locs = currentSiteList(br, logfile)
        for site_url, lo in zip(links, locs):
            # follow the link to search form and execute search
            try:
                local_links = executeSearch(br, site_url)
            except Exception as err:
                # one unreachable or differently laid out site must not
                # abort the whole run
                logfile.write('\nSearch failed for %s: %r' % (site_url, err))
                continue
            # update the table
            insertRecs(local_links, adsdb, br, lo, logfile)
        # format and email the new records
        sent = composeMesg(adsdb, logfile)
        # now can log errors if email not sent
        if sent:
            logfile.write('\nEmail sent')
        else:
            logfile.write('\nEmail failed')
        # perform database maintenance: delete old records and close shop
        cleandb(adsdb, logfile)
        # time the script
        duration = (time.time() - start) / (60.0 ** 2)
        logfile.write('\nScript ran for (hrs): ')
        logfile.write(str(duration))
    finally:
        # flush and close the log even when the run aborts
        logfile.close()


if __name__ == "__main__":
    apprun()