#!/usr/bin/env python
"""
Last edited: Tue Jan 19, 2010 10:12am
This script queries Craigslist job postings in the US using QUERY_STR.
Dependencies: mechanize,BeautifulSoup,buzhug,html2text
"""
import mechanize, os, re, string, smtplib
from datetime import datetime, date, timedelta
from BeautifulSoup import BeautifulSoup
from buzhug import Base
from time import sleep
from html2text import html2text
from email.MIMEText import MIMEText
import time
# set the global vars
# Directory containing this script; apprun() changes into it before opening
# the log file and the database.
ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
# Search string submitted in the 'query' field of the Craigslist search form.
QUERY_STR = 'structural -"mechanical engineer" -estimator -drafter -draftsman -SoCore -PT&C -www.robsonforensic.com'
DIGEST_SIZE = 40 # number of words to fetch and store for each post
# Mail settings used by composeMesg(): MAIL_FROM is both the login name and the
# sender address, MAIL_TO is the recipient.
# NOTE(review): these look like placeholders, and the password is kept in
# plain text in the source -- confirm how real credentials are supplied.
MAIL_FROM = "my_email@gmail.com"
MAIL_TO = "my_email@gmail.com"
MAIL_PASS = "my_password"
SMTP_HOST = "smtp.gmail.com"
SMTP_PORT = 587
EXPIRE = 31 # number of days after which the ad is deleted from the table
# Page opened by currentSiteList() to collect the links to the individual sites.
CRAIG_ROOT ="http://geo.craigslist.org/iso/us"
def prepareDB(logfile):
    """Open the ads table, creating it the first time the script runs.

    A ``db`` directory is created under the current working directory when
    it is missing, the process changes into it, and the ``ads`` base is
    opened there.  When opening raises IOError the base is created with the
    posting schema instead.  One line saying which path was taken is written
    to *logfile*.

    Returns the buzhug ``Base`` instance.
    """
    db_dir = './db'
    if not os.path.exists(db_dir):
        os.makedirs(db_dir)
    # The working directory stays changed after this function returns.
    os.chdir(db_dir)

    table = Base('./ads')
    try:
        table.open()
        logfile.write('\nOpening existing database...')
    except IOError:
        schema = (
            ('PostingID', str),
            ('dt', datetime),
            ('title', str),
            ('location', str),
            ('url', unicode),
            ('body', unicode),
            ('notified', bool),  # True if record emailed
        )
        table.create(*schema)
        logfile.write('\nOpening new database...')
    return table
def setBrowser():
    """Build and return a configured mechanize browser.

    The http-equiv, gzip, redirect and referer handlers are switched on,
    robots.txt handling is switched off, and a desktop Firefox User-agent
    header is installed.
    """
    handler_flags = (
        ('set_handle_equiv', True),
        ('set_handle_gzip', True),
        ('set_handle_redirect', True),
        ('set_handle_referer', True),
        ('set_handle_robots', False),
    )
    user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.10) Gecko/2009042523 Ubuntu/9.04 (jaunty) Firefox/3.0.10'

    browser = mechanize.Browser()
    # Apply the handler switches in the listed order.
    for setter_name, enabled in handler_flags:
        getattr(browser, setter_name)(enabled)
    browser.addheaders = [('User-agent', user_agent)]
    return browser
def _anchorLabel(anchor):
    """Return the stripped text of the anchor's first child tag, falling back
    to the anchor's own text, and to '' when neither has a single string."""
    try:
        return anchor.findChildren()[0].string.strip()
    except (IndexError, AttributeError):
        # IndexError: the anchor has no child tags.
        # AttributeError: the child's .string is None.
        pass
    try:
        return anchor.string.strip()
    except AttributeError:
        return ''


def currentSiteList(br,logfile):
    """Returns the links and names of the Craigslist sites within the US.

    Opens CRAIG_ROOT with the browser *br*, looks up the element with
    id="list" and collects every anchor in it that carries an href.

    Returns a pair ``(hrefs, names)`` of equal-length lists; ``names[i]`` is
    the label for ``hrefs[i]``.  When the id="list" element is missing, a
    line is written to *logfile* and two empty lists are returned instead of
    raising IndexError.  The number of sites found is written to *logfile*.
    """
    root_soup = BeautifulSoup(br.open(CRAIG_ROOT).read())
    containers = root_soup.findAll(id="list")
    if not containers:
        logfile.write('\nCould not find the site list at %s' % CRAIG_ROOT)
        return [], []

    # href=True skips anchors without a target, which would otherwise raise
    # when their "href" is read below.
    anchors = containers[0].findAll('a', href=True)
    locs = [_anchorLabel(a) for a in anchors]
    logfile.write('\nFound %d sites nationwide' % len(locs))
    return [a["href"] for a in anchors], locs
def executeSearch(br,root):
    """Runs the QUERY_STR search on one site and returns the posting links.

    *br* is the mechanize browser and *root* the URL of a site's front page.
    The function follows the "arch / engineering" link, selects the first
    form on the resulting page, submits it with category 'egr' and QUERY_STR,
    and returns the list of links on the result page whose URL looks like
    ``/egr/<digits>.html``.

    Errors raised by mechanize (e.g. a missing link, or a network failure)
    propagate to the caller.
    """
    br.open(root)
    br.open(br.click_link(text="arch / engineering"))
    # select the first form on the page
    try:
        br.select_form(nr=0)
    except mechanize.FormNotFoundError:
        # No form here: follow the link through to the page that has the form.
        br.open(br.click_link(text="Continue to arch / engineering job postings"))
        br.select_form(nr=0)
    # execute search
    br.form['catAbbreviation'] = ['egr']
    br.form['query'] = QUERY_STR
    br.submit()
    # The dot is escaped so only a literal ".html" suffix matches.
    urlre = re.compile(r'(/egr/([0-9]+)\.html)')
    return list(br.links(url_regex=urlre))
def _findPostLocation(soup, default):
    """Return the text following 'Location:' in the first <li> that has one,
    or *default* when no <li> matches."""
    for tag in soup.findAll('li'):
        places = re.findall(r'Location:([ a-zA-Z,.]+)', str(tag))
        if places:
            # The captured group cannot contain ':', so it is used as is.
            return places[0]
    return default


def _parsePostDate(html):
    """Return the posting time found after 'Date:' in *html* as a datetime,
    or the current time when it is missing or cannot be parsed."""
    dates = re.findall(r'Date:([ 0-9-,:A-Z]+)', html)
    if dates:
        # Keep only the date and time tokens; anything after them is dropped.
        stamp = ' '.join(dates[0].strip().split()[:2])
        try:
            # %I (12-hour clock) is required for %p to affect the parsed hour.
            return datetime.strptime(stamp, "%Y-%m-%d, %I:%M%p")
        except ValueError:
            pass
    # a recently expired post could be lurking here
    return datetime.now()


def insertRecs(local_links,adsdb,br,lo,logfile):
    """Fetches unseen posts and stores them; returns the ids of new records.

    For every link in *local_links* the PostingID is taken from the last
    path component of the link URL.  Links whose PostingID is already in
    *adsdb* are skipped without being fetched (to reduce the load on
    Craigslist).  Otherwise the post is opened with the browser *br* and its
    location (default: ``str(lo)``), date, final URL and a digest of the
    first DIGEST_SIZE space-separated chunks of the 'userbody' element are
    extracted.  Posts without a 'userbody' element, and posts whose location
    and digest match an existing record (a repost), are not stored.

    Any error while fetching, parsing or inserting a post is logged to
    *logfile* and the loop continues with the next link.

    Returns a list with the record ids returned by ``adsdb.insert`` for the
    records inserted on this run (empty when nothing was inserted).
    """
    inserted = []
    for link in local_links:
        postid = link.url.split('/')[-1].split('.')[0]
        if adsdb.select(None, PostingID=postid):
            continue  # record exists, no need to fetch it again
        try:
            br.follow_link(link)
            response = br.response()
            html = response.read()
            soup = BeautifulSoup(html)
            post_location = _findPostLocation(soup, str(lo))
            posted = _parsePostDate(html)
            post_url = response.geturl()

            userbody = soup.findAll(id='userbody')
            if not userbody:
                continue  # if no body, don't bother processing
            digest = ' '.join(unicode(userbody[0]).split(' ')[:DIGEST_SIZE])
            digest = html2text(digest)

            # see if there's an exact same record in db (a repost?)
            if adsdb.select(None, location=post_location, body=digest):
                continue
            inserted.append(adsdb.insert(PostingID=postid, dt=posted,
                                         title=link.text,
                                         location=post_location,
                                         url=post_url, body=digest,
                                         notified=False))
        except Exception:
            # Best effort: one bad post must not stop the rest of the batch.
            logfile.write("\nCouldn't follow this link: %s" %str(link))
    return inserted
def composeMesg(adsdb,logfile):
    """Emails all records not yet notified and marks them as notified.

    Selects every record in *adsdb* with ``notified=False``, writes their
    count to *logfile*, formats them into one plain-text message and sends
    it from MAIL_FROM to MAIL_TO through SMTP_HOST:SMTP_PORT using STARTTLS
    and a login with MAIL_FROM / MAIL_PASS.  The message is sent even when
    there are no records.

    Returns True when the mail was sent and the records were updated to
    ``notified=True``; returns False when any step of sending or updating
    raised an exception.
    """
    recs = adsdb.select_for_update(['dt','title','location','url','body','notified'],notified=False)
    logfile.write("\nThere's a total of %d records to email" %len(recs))

    entry = "\n%s \nDate: %s \nTitle: %s (%s) \nDigest: %s"
    separator = "\n\n----------------------\n"
    mesg = ''.join([
        entry % (i.url.encode('utf-8'),
                 i.dt.strftime("%A, %d %B %Y, %I:%M%p"),
                 i.title,
                 i.location,
                 i.body.encode('utf-8')) + separator
        for i in recs
    ])
    try:
        # url and body are encoded as UTF-8 above, so declare that charset.
        msg = MIMEText(_text=mesg, _charset='utf-8')
        msg['subject'] = "Craigslist updates"
        sender = 'craigslistJobs.py <%s>' %(MAIL_FROM)
        msg['from'] = sender
        msg['to'] = MAIL_TO
        s = smtplib.SMTP(SMTP_HOST,SMTP_PORT)
        try:
            s.ehlo()
            s.starttls()
            s.ehlo()
            s.login(MAIL_FROM,MAIL_PASS)
            s.sendmail(sender, MAIL_TO, msg.as_string())
        finally:
            # Release the connection whether or not sending succeeded.
            s.close()
        # if success, set notified=True
        adsdb.update(recs, notified = True)
        return True
    except Exception:
        return False #couldn't send the email
def cleandb(adsdb,logfile):
    """Deletes old, already-emailed records and closes the database.

    Records whose ``dt`` is earlier than today minus EXPIRE days and whose
    ``notified`` flag is set are deleted from *adsdb*, after which
    ``adsdb.cleanup()`` is called.  The number of records selected for
    deletion is written to *logfile*.  A failure in any of these steps is
    logged and otherwise ignored.  *adsdb* is closed in every case.
    """
    lastday = date.today() - timedelta(days=EXPIRE)
    try:
        #fetch old records that have been included in emailed reports
        recs = adsdb.select_for_update(None,'dt < stamp and notified',stamp=lastday)
        # Leading newline keeps this entry on its own line, like the others.
        logfile.write("\nnumber of records to be deleted: %d" %len(recs))
        adsdb.delete(recs)
        adsdb.cleanup()
    except Exception:
        logfile.write('\nFailed to delete old files')
    finally:
        adsdb.close()
    return
def apprun():
    """Main function: crawl every site, email new posts, prune old ones.

    Changes into ROOT_DIR, opens (and truncates) 'searchCraigslistJobs.log',
    prepares the database and browser, then runs the search on every site
    returned by currentSiteList() and stores the new posts.  Afterwards the
    pending records are emailed, old records are deleted and the run time in
    hours is logged.

    A failure while searching one site is written to the log with its
    traceback and the run continues with the next site.  The log file is
    closed even when the run ends with an exception.
    """
    import traceback  # only needed here, to log per-site failures

    # set the timer
    start = time.time()
    os.chdir(ROOT_DIR)
    #open log file
    logfile = open('searchCraigslistJobs.log','w')
    try:
        logfile.write(datetime.now().strftime("%A, %d %B %Y, %I:%M%p") + '\n\n\n')
        adsdb = prepareDB(logfile)
        br = setBrowser()
        links, locs = currentSiteList(br,logfile)
        for l, lo in zip(links, locs):
            try:
                # follow the link to search form and execute search
                local_links = executeSearch(br,l)
                # update the table
                insertRecs(local_links,adsdb,br,lo,logfile)
            except Exception:
                # One unreachable or differently laid out site must not
                # abort the whole crawl.
                logfile.write('\nSkipping site %s:\n%s' % (l, traceback.format_exc()))
        # format and email the new records
        sent = composeMesg(adsdb,logfile)
        # now can log errors if email not sent
        if sent:
            logfile.write('\nEmail sent')
        else:
            logfile.write('\nEmail failed')
        #perform database maintenance: delete old records and close shop
        cleandb(adsdb,logfile)
        # time the script
        duration = (time.time() - start)/(60.0**2)
        logfile.write('\nScript ran for (hrs): ')
        logfile.write(str(duration))
    finally:
        logfile.close()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    apprun()