from Tkinter import *
import re

from lxml.html import parse

  6. curProxy = {}
  7. proxies = []
  8.  
  9. class App:
  10.  
  11. def __init__(self, master):
  12.  
  13. counter = "what"
  14.  
  15. frame = Frame(master)
  16. frame.pack()
  17.  
  18. self.hi_there = Button(frame, text="Scrape", command=self.scrape)
  19. self.hi_there.pack(side=LEFT)
  20.  
  21. self.scrollbar = Scrollbar(frame, orient=VERTICAL)
  22.  
  23. self.listbox = Listbox(frame, yscrollcommand=self.scrollbar.set)
  24. self.scrollbar.config(command=self.listbox.yview)
  25. self.scrollbar.pack(side=RIGHT, fill=Y)
  26.  
  27. self.listbox.pack()
  28.  
  29.  
  30.  
  31. def scrape(self):
  32.  
  33. urls = ['http://www.proxy4free.com/page1.html']
  34. ## 'http://www.proxy4free.com/page2.html',
  35. ## 'http://proxynext.com/proxylist1.php',
  36. ## 'http://proxynext.com/proxylist2.php']
  37.  
  38. for url in urls:
  39.  
  40. doc = parse(url).getroot()
  41. for td in doc.cssselect('td'):
  42.  
  43. ipPattern = re.compile("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?!(-|.|:))")
  44. ipMatch = ipPattern.match(td.text_content())
  45.  
  46. portPattern = re.compile("\d{2,4}(?!(-|.|:))")
  47. portMatch = portPattern.match(td.text_content())
  48.  
  49. if ipMatch:
  50. curProxy['ip'] = td.text_content()
  51.  
  52. elif portMatch:
  53. curProxy['port'] = td.text_content()
  54. print curProxy
  55.  
  56. proxies.append(curProxy)
  57.  
  58. for i in range(20):
  59. self.listbox.insert(END, proxies[i]['ip'] + ':' + proxies[i]['port'])
  60.  
  61. root = Tk()
  62. app = App(root)
  63. root.mainloop()