from Tkinter import *
from lxml.html import parse
import re
# Module-level list of proxy records; App.scrape appends to it and reads it
# back to fill the listbox. It lives for the whole process, so it keeps
# growing across scrapes.
proxies = []
# Module-level scratch record whose 'ip' / 'port' keys are written by
# App.scrape while it walks the table cells.
curProxy = {}
class App:
    """Small Tk window that scrapes proxy lists and shows them as ip:port.

    The window holds a "Scrape" button, a listbox and a vertical scrollbar
    wired to that listbox. Pressing the button runs :meth:`scrape`.
    """

    # Compiled once at class level instead of once per table cell.
    # A cell counts as an IP / port only when the *whole* stripped text is
    # one; the old lookahead ``(?!(-|.|:))`` had an unescaped ``.`` and so
    # really meant "nothing may follow", which this states directly.
    _IP_RE = re.compile(r"\d{1,3}(?:\.\d{1,3}){3}$")
    # Ports go up to 65535, so allow five digits (the old pattern stopped
    # at four and silently dropped high ports).
    _PORT_RE = re.compile(r"\d{2,5}$")
    # Maximum number of entries pushed into the listbox per scrape.
    _MAX_ROWS = 20

    def __init__(self, master):
        """Build the widgets inside *master* (a Tk root or other container)."""
        frame = Frame(master)
        frame.pack()

        self.hi_there = Button(frame, text="Scrape", command=self.scrape)
        self.hi_there.pack(side=LEFT)

        # Listbox and scrollbar drive each other: the listbox reports its
        # view to the scrollbar, and the scrollbar scrolls the listbox.
        self.scrollbar = Scrollbar(frame, orient=VERTICAL)
        self.listbox = Listbox(frame, yscrollcommand=self.scrollbar.set)
        self.scrollbar.config(command=self.listbox.yview)
        self.scrollbar.pack(side=RIGHT, fill=Y)
        self.listbox.pack()

    @classmethod
    def _pair_cells(cls, cell_texts):
        """Pair each IP cell with the first port cell that follows it.

        *cell_texts* is an iterable of table-cell strings in document order.
        Surrounding whitespace is ignored. Returns a list of new
        ``{'ip': ..., 'port': ...}`` dicts, one per completed pair; an IP
        that is never followed by a port is dropped, and numeric cells that
        do not follow an IP are ignored.
        """
        pairs = []
        pending_ip = None
        for raw in cell_texts:
            text = raw.strip()
            if cls._IP_RE.match(text):
                # A newer IP replaces one that never got a port.
                pending_ip = text
            elif pending_ip is not None and cls._PORT_RE.match(text):
                # Fresh dict per proxy so entries never alias each other.
                pairs.append({'ip': pending_ip, 'port': text})
                pending_ip = None
        return pairs

    def scrape(self):
        """Fetch the proxy pages, record the proxies and list the first ones.

        Every ip/port pair found is appended to the module-level ``proxies``
        list (``curProxy`` is kept pointing at the latest values), then up
        to ``_MAX_ROWS`` entries from ``proxies`` are inserted into the
        listbox as ``ip:port``. The download runs synchronously inside the
        button callback, so the window does not respond until it finishes.
        """
        urls = ['http://www.proxy4free.com/page1.html']
        # Further sources, currently disabled:
        ## 'http://www.proxy4free.com/page2.html',
        ## 'http://proxynext.com/proxylist1.php',
        ## 'http://proxynext.com/proxylist2.php']
        for url in urls:
            doc = parse(url).getroot()
            cell_texts = [td.text_content() for td in doc.cssselect('td')]
            for entry in self._pair_cells(cell_texts):
                # Keep the shared scratch dict in sync for any code that
                # still inspects it, but store the independent copy.
                curProxy.clear()
                curProxy.update(entry)
                print(curProxy)
                proxies.append(entry)

        # Slice instead of range(20): fewer than 20 results is not an error.
        for entry in proxies[:self._MAX_ROWS]:
            self.listbox.insert(END, entry['ip'] + ':' + entry['port'])
# Start the GUI only when this file is executed as a script, so importing the
# module (for example to reuse App) does not open a window or block inside
# the Tk event loop.
if __name__ == "__main__":
    root = Tk()
    app = App(root)
    root.mainloop()