# -*- coding: utf-8 -*-
"""Scrapy spider that collects the group entries of a BelWue admin domain."""
try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

from scrapy import Spider
from scrapy.http import Request
from scrapy.selector import Selector

from belwue.items import BelwueItem
from belwue.settings import admindomain
from belwue.settings import password


# Rows of the object list whose cell text is exactly "Gruppe" (group).
_GROUP_ROWS_XPATH = (
    '/descendant-or-self::node()'
    '/child::table[attribute::class="settingsBox"]'
    '/child::tr[position()=3]/child::td/child::table'
    '/child::tr[td="Gruppe"]'
)

# Non-empty input values inside the settings table captioned "Mitglieder"
# (members).
# CSS equivalent noted by the original author:
# table.settingsBox:nth-child(5) > tbody:nth-child(2) > tr:nth-child(1)
#   > td:nth-child(1) > input:nth-child(1)
_MEMBERS_XPATH = (
    '/descendant-or-self::node()'
    '/child::table[attribute::class="settingsBox"'
    ' and child::caption="Mitglieder"]'
    '/child::tr/child::td'
    '/child::input[string-length(@value)!=0]/@value'
)

# Input value in the second cell of the first row of the first settings table.
# CSS equivalent noted by the original author:
# table.settingsBox:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1)
#   > td:nth-child(1)
_PRETTY_NAME_XPATH = (
    '/descendant-or-self::node()'
    '/child::table[attribute::class="settingsBox" and position()=1]'
    '/tr[position()=1]/td[position()=2]/input/@value'
)


def _first(values, default=None):
    """Return the first element of *values*, or *default* if it is empty."""
    return values[0] if values else default


class MaillistenSpider(Spider):
    """Crawl the domain object list and emit one ``BelwueItem`` per group.

    ``parse`` reads the object list page and schedules one request per group
    row; ``parse_job`` completes the item from the group's detail page.
    """

    name = "maillisten"
    allowed_domains = ["mbox1.belwue.de"]
    # Credentials picked up by Scrapy's HttpAuthMiddleware (HTTP basic auth).
    http_user = u"admin"
    http_pass = password
    start_urls = [
        u"https://mbox1.belwue.de:9010/DomainAdmin/" + admindomain
        + u"/ObjectList.html?InCluster=1&domainName=" + admindomain + u"&"
    ]

    def parse(self, response):
        """Yield one detail-page request per group row of the object list.

        Every field is extracted relative to its own table row. The earlier
        approach gathered links, names and sizes into three page-wide lists
        and indexed them in lockstep, which raised ``IndexError`` or mixed up
        rows as soon as one row lacked a link, a name or a size cell.

        Rows without a link are skipped (there is nothing to follow); a
        missing name or size is stored as ``None``.
        """
        hxs = Selector(response)
        for row in hxs.xpath(_GROUP_ROWS_XPATH):
            link = _first(row.xpath('td/a/@href').extract())
            if link is None:
                self.log(
                    "Skipping group row without a link on %s" % response.url
                )
                continue

            item = BelwueItem()
            item['grouplink'] = link
            item['groupname'] = _first(row.xpath('td/a/text()').extract())
            item['groupsize'] = _first(
                row.xpath('td[position()=3]/text()').extract()
            )
            # The partially filled item travels with the request so that
            # parse_job can complete it.
            yield Request(
                urljoin(response.url, link),
                meta={'item': item},
                callback=self.parse_job,
            )

    def parse_job(self, response):
        """Complete the item passed via request meta and yield it.

        Adds ``members`` and ``groupprettyname``; both are stored as the
        lists of strings returned by ``extract()``.
        """
        hxs = Selector(response)
        item = response.request.meta['item']
        item['members'] = hxs.xpath(_MEMBERS_XPATH).extract()
        item['groupprettyname'] = hxs.xpath(_PRETTY_NAME_XPATH).extract()
        yield item