#!/usr/bin/python3
import threading, psycopg2, feedparser
from psycopg2.extras import execute_values
# RSS feeds for the "cars & trucks - by owner" (cto) search, one per
# Craigslist regional subdomain. Every URL has the same shape, so the list is
# generated from the subdomain names.
craigslist_feeds = [
    "https://{}.craigslist.org/search/cto?format=rss".format(subdomain)
    for subdomain in (
        "detroit",
        "seattle",
        "boston",
        "washingtondc",
        "houston",
        "losangeles",
        "stlouis",
        "up",
    )
]
def fetch_rss(url):
    """Download one RSS feed and insert its entries into the database.

    The feed at ``url`` is parsed with feedparser, and the ``title`` and
    ``link`` of every entry are bulk-inserted into the ``craigslist`` table
    in a single transaction.

    Args:
        url: Address of the RSS feed to fetch.

    Returns:
        None.

    Raises:
        KeyError: If an entry lacks a ``title`` or ``link`` field.
        psycopg2.Error: If connecting or inserting fails. The connection is
            still closed, and the uncommitted insert is discarded.
    """
    # Fetch and parse first, so no database connection is held open while
    # waiting on the network.
    rss = feedparser.parse(url)
    parsed_data = [(entry["title"], entry["link"]) for entry in rss["entries"]]
    if not parsed_data:
        # Nothing to insert, so there is no reason to open a connection.
        return

    con = psycopg2.connect(dbname="", user="")
    try:
        # The cursor context manager closes the cursor on exit, before the
        # connection itself is closed.
        with con.cursor() as cur:
            execute_values(
                cur,
                "insert into craigslist (title,link) values %s",
                parsed_data,
            )
        con.commit()
    finally:
        # Always release the connection, even when the insert raised.
        con.close()
# Fetch every feed concurrently: one worker thread per feed URL.
for feed_url in craigslist_feeds:
    worker = threading.Thread(target=fetch_rss, args=(feed_url,))
    worker.start()
I downloaded the EPA fuel economy data from here. All I really need from this data is the make, model year, and vehicle class information.
The problem is that Craigslist titles aren't consistent and are sometimes missing information. It's not uncommon, for example, to see an ad for a "2013 F150" instead of a "2013 Ford F150". So how do I determine the make and vehicle class from an incomplete title? The answer is to use machine learning (k-nearest neighbors or naive Bayes) to find the most likely make and class for a given Craigslist title.
So I need to train a model on the EPA data, then use it to predict the make and vehicle class for each Craigslist title. This is an ongoing project.
No comments:
Post a Comment