{{tag> belwue mailingliste xpath}}

====== Extracting mailing lists from Belwue ======

**Problem**: We host our mail at Belwue, where we create and maintain our mailing lists. I would like to "publish" these lists internally as well, but Belwue offers no way to export the data, and copying everything by hand is too tedious.

**Solution**: A script that fetches the data from Belwue through the "administrator" account.

===== Scrapy on Ubuntu 12.04 =====

  * Follow these instructions to install the latest scrapy: http://doc.scrapy.org/en/latest/topics/ubuntu.html#topics-ubuntu

<code bash>
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 627220E7
echo 'deb http://archive.scrapy.org/ubuntu scrapy main' | sudo tee /etc/apt/sources.list.d/scrapy.list
sudo apt-get update && sudo apt-get install scrapy-0.24
</code>

  * Start the project:

<code bash>
# scrapy startproject belwue
# cd belwue/
# scrapy genspider maillisten mbox1.belwue.de
</code>

  * Later, run the crawl from that directory:

<code bash>
# scrapy crawl maillisten
</code>

===== Scripts =====

Files needed, as created by "startproject":

<code>
./belwue
./belwue/spiders
./belwue/spiders/__init__.py      <- empty
./belwue/spiders/maillisten.py
./belwue/settings.py
./belwue/items.py
./belwue/pipelines.py
./belwue/scrape_and_upload.sh     <- written by hand, for the upload
./belwue/__init__.py              <- empty
./scrapy.cfg
</code>

==== settings ====

<code python>
# -*- coding: utf-8 -*-

BOT_NAME = 'belwue'

SPIDER_MODULES = ['belwue.spiders']
NEWSPIDER_MODULE = 'belwue.spiders'
ITEM_PIPELINES = {'belwue.pipelines.BelwuePipeline': 1}

# site-specific constants, imported by the spider and the pipeline
emaildomain = u"@humboldt-ka.de"
admindomain = u"humboldt-gymnasium-karlsruhe.de"
password = "xxxxxxxx"
</code>

==== items ====

<code python>
# -*- coding: utf-8 -*-

import scrapy

class BelwueItem(scrapy.Item):
    groupname = scrapy.Field()        # local part of the list address
    groupprettyname = scrapy.Field()  # human-readable list name
    groupsize = scrapy.Field()        # number of members, as shown in the overview
    grouplink = scrapy.Field()        # relative admin link to the list's detail page
    members = scrapy.Field()          # local parts of the member addresses
</code>

==== pipeline ====

<code python>
# -*- coding: utf-8 -*-

import time

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.contrib.exporter import BaseItemExporter

from belwue.settings import admindomain, emaildomain


class WikiItemExporter(BaseItemExporter):
    """Exporter base class with helpers that emit DokuWiki markup."""

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.output = file

    def _write_heading(self, string, output, which=1):
        output.write("=" * which + string.encode('utf8') + "=" * which + "\n")

    def _write_bullet(self, string, output, indent=1):
        # DokuWiki needs at least two leading spaces for a list item
        output.write("  " * indent + "* " + string.encode('utf8') + "\n")

    def _encode_wikilink(self, linkstring, string=u"Link"):
        return u"[[" + linkstring + u"|" + string + u"]]"


class MyWikiItemExporter(WikiItemExporter):
    """Writes the overview into a header file and the member lists into a body file."""

    def __init__(self, header, body, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.body = body
        self.header = header

    def start_exporting(self):
        self._write_heading(u"Aktueller Stand der Verteilerlisten", self.header, 6)
        self.header.write((u"**Stand: " + time.strftime('%d.%b %Y') + u"**\n").encode('utf8'))
        self._write_heading(u"Alle Verteilerlisten", self.header, 5)

    def finish_exporting(self):
        pass

    def export_item(self, item):
        # overview bullet for the header: list address, pretty name, size, admin link
        bullet = self._encode_wikilink(item['groupname'] + emaildomain,
                                       item['groupprettyname'][0]) \
            + u": Mitglieder " + item['groupsize'] + u" - " \
            + self._encode_wikilink(u"https://mbox1.belwue.de:9010/DomainAdmin/"
                                    + admindomain + u"/" + item['grouplink'],
                                    u"Administratorlink zur Verwaltung")
        self._write_bullet(bullet, self.header)
        # body: one heading per list, followed by its members as bullets
        self._write_heading(item['groupprettyname'][0], self.body, 5)
        for member in item['members']:
            self._write_bullet(self._encode_wikilink(member + emaildomain, u""), self.body)
        self.body.write("\n")


class BelwuePipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.header = open('heading.txt', 'w+b')
        self.body = open('body.txt', 'w+b')
        self.exporter = MyWikiItemExporter(self.header, self.body)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.header.close()
        self.body.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
</code>
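The exporter can be exercised without a crawl by feeding it a hand-built item. A minimal sketch (Python 2, like the rest of the project; all field values below are invented):

<code python>
# -*- coding: utf-8 -*-
# Offline test of MyWikiItemExporter -- no crawl needed, values invented.
from io import BytesIO

from belwue.items import BelwueItem
from belwue.pipelines import MyWikiItemExporter

item = BelwueItem()
item['groupname'] = u"lehrer"               # hypothetical list
item['groupprettyname'] = [u"Alle Lehrer"]  # extract() returns a list
item['groupsize'] = u"3"
item['grouplink'] = u"Mailing/lehrer.html"  # hypothetical admin link
item['members'] = [u"maier", u"mueller", u"schulz"]

header, body = BytesIO(), BytesIO()
exporter = MyWikiItemExporter(header, body)
exporter.start_exporting()
exporter.export_item(item)
exporter.finish_exporting()

print header.getvalue()  # overview heading plus one bullet per list
print body.getvalue()    # list heading plus one bullet per member
</code>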
==== spider ====

<code python>
# -*- coding: utf-8 -*-

from urlparse import urljoin

from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from belwue.items import BelwueItem
from belwue.settings import admindomain, password


class MaillistenSpider(Spider):
    name = "maillisten"
    allowed_domains = ["mbox1.belwue.de"]
    # HTTP basic auth for the admin interface
    http_user = u"admin"
    http_pass = password
    start_urls = [
        u"https://mbox1.belwue.de:9010/DomainAdmin/" + admindomain
        + u"/ObjectList.html?InCluster=1&domainName=" + admindomain + u"&"
    ]

    def parse(self, response):
        hxs = Selector(response)
        # All three expressions walk to the rows of the object list that
        # carry a td "Gruppe" and pull out link, name and size per group.
        xpathlink = '/descendant-or-self::node()/child::table[attribute::class="settingsBox"]/child::tr[position()=3]/child::td/child::table/child::tr[td="Gruppe"]/td/a/@href'
        links = hxs.xpath(xpathlink).extract()
        xpathname = '/descendant-or-self::node()/child::table[attribute::class="settingsBox"]/child::tr[position()=3]/child::td/child::table/child::tr[td="Gruppe"]/td/a/text()'
        names = hxs.xpath(xpathname).extract()
        xpathsize = '/descendant-or-self::node()/child::table[attribute::class="settingsBox"]/child::tr[position()=3]/child::td/child::table/child::tr[td="Gruppe"]/td[position()=3]/text()'
        sizes = hxs.xpath(xpathsize).extract()
        # Risky: if links, names or sizes are not all retrieved correctly,
        # the three lists end up with different lengths.
        for i in range(len(names)):
            item = BelwueItem()
            item['grouplink'] = links[i]
            item['groupname'] = names[i]
            item['groupsize'] = sizes[i]
            # follow the group's detail page; the item rides along in meta
            yield Request(urljoin(response.url, links[i]),
                          meta={'item': item}, callback=self.parse_job)

    def parse_job(self, response):
        hxs = Selector(response)
        item = response.request.meta['item']
        # members: every non-empty input field in the "Mitglieder" box
        xpathmemb = '/descendant-or-self::node()/child::table[attribute::class="settingsBox" and child::caption="Mitglieder"]/child::tr/child::td/child::input[string-length(@value)!=0]/@value'
        item['members'] = hxs.xpath(xpathmemb).extract()
        # pretty name: second column of the first row of the first settings box
        xpathprettyname = '/descendant-or-self::node()/child::table[attribute::class="settingsBox" and position()=1]/tr[position()=1]/td[position()=2]/input/@value'
        item['groupprettyname'] = hxs.xpath(xpathprettyname).extract()
        yield item
</code>
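The verbose axis notation above is just the expanded form of the abbreviated XPath syntax: ''%%//table%%'' is short for ''/descendant-or-self::node()/child::table'', and ''tr[3]'' is short for ''tr[position()=3]''. The expressions can be sanity-checked offline against a mock of the Belwue markup; a sketch (the HTML fragment is invented and only mimics the structure the selectors expect):

<code python>
# -*- coding: utf-8 -*-
# Offline sanity check of the list-page XPath against invented mock HTML.
from scrapy.selector import Selector

html = u"""
<html><body>
<table class="settingsBox">
  <tr><td>header row</td></tr>
  <tr><td>filler row</td></tr>
  <tr><td>
    <table>
      <tr><td><a href="Mailing/lehrer.html">lehrer</a></td>
          <td>Gruppe</td><td>3</td></tr>
    </table>
  </td></tr>
</table>
</body></html>
"""

sel = Selector(text=html)
# abbreviated equivalent of the row selector used in parse()
rows = u'//table[@class="settingsBox"]/tr[3]/td/table/tr[td="Gruppe"]'
print sel.xpath(rows + u'/td/a/@href').extract()    # [u'Mailing/lehrer.html']
print sel.xpath(rows + u'/td/a/text()').extract()   # [u'lehrer']
print sel.xpath(rows + u'/td[3]/text()').extract()  # [u'3']
</code>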
==== deployment config (scrapy.cfg) ====

<code>
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = belwue.settings

[deploy]
#url = http://localhost:6800/
project = belwue
</code>

==== upload script (scrape_and_upload.sh) ====

<code bash>
#!/bin/bash

target="ab12345@pubwww5.belwue.de:/srv/www/virtual/23483/www.humboldt-gymnasium-karlsruhe.de/vhostdata/htdoc/portfolio/data/pages/it-document/mailserver"
key="/home/user/sshkey.rsa"

# crawl, then concatenate overview and member lists into one wiki page
scrapy crawl maillisten
cat heading.txt body.txt > maillisten.txt

# push the page into the DokuWiki pages directory and clean up
rsync -a -e "ssh -o ConnectTimeout=5 -i ${key}" maillisten.txt ${target}/verteilerlisten.txt
rm heading.txt body.txt maillisten.txt
</code>

===== Link collection =====

== XPath reference ==
  * http://www.mulberrytech.com/quickref/XSLT_1quickref-v2.pdf
  * http://www.w3schools.com/XPath/xpath_axes.asp

== Grokking XPath ==
  * http://stackoverflow.com/questions/5580372/testing-for-an-xml-attribute
  * http://stackoverflow.com/questions/4835891/how-to-extract-attribute-s-value-through-xpath
  * http://doc.scrapy.org/en/latest/topics/selectors.html
  * http://doc.scrapy.org/en/latest/topics/request-response.html
  * https://scrapy.readthedocs.org/en/latest/topics/link-extractors.html

== Recursive crawling ==
  * http://stackoverflow.com/questions/13910357/how-can-i-use-multiple-requests-and-pass-items-in-between-them-in-scrapy-python/13911764#13911764
  * http://stackoverflow.com/questions/11150053/scrapy-crawl-multiple-pages-per-item
  * http://stackoverflow.com/questions/9334522/scrapy-follow-link-to-get-additional-item-data

== Exporting ==
  * https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/exporter/__init__.py
  * http://stackoverflow.com/a/12394371
  * http://stackoverflow.com/questions/12230332/how-can-scrapy-export-items-to-separate-csv-files-per-item
  * http://doc.scrapy.org/en/latest/topics/feed-exports.html#topics-feed-exports