このURLの例では、#HeadRowMenu からすべてのリンクURLを取得し、ループで各ページからすべての見出し(h1 など)を抽出できます。
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = "http://dsv.su.se/en"
base = "http://dsv.su.se"
def crawl(start, base):
r = requests.get(start)
soup = BeautifulSoup(r.content, "lxml")
hs = ["h1", "h2", "h3", "h4", "h5", "h6"]
menu_links = [urljoin(base, a["href"]) for a in soup.select("#HeadRowMenu a")][1:]
for h in hs:
yield soup.find_all(h)
for lnk in menu_links:
soup = BeautifulSoup(requests.get(lnk).content)
for h in hs:
yield soup.find_all(h)
これを実行すると:
In [17]: print(list(chain.from_iterable(crawl(url, base))))
[<h1 class="visuallyhidden">Department of Computer and Systems Sciences</h1>, <h1>
<a href="/en/about/news/improve-your-digital-competences-on-line-with-eskills-match-1.278510">Improve your digital competences on-line with eSkills Match</a>
</h1>, <h1>
<a href="/en/about/news/envisioning-equality-in-computer-science-tomorrow-today-1.272045">Envisioning Equality in Computer Science - Tomorrow Today</a>
</h1>, <h1>
<a href="/en/about/news/egovlab-develops-online-democracy-1.271839">eGovlab develops online democracy</a>
</h1>, <h1>
<a href="/en/about/events/vinnova-and-dsv-invite-you-to-a-seminar-about-horizon-2020-1.266104">Vinnova and DSV invite you to a seminar about Horizon 2020</a>
</h1>, <h1>
<a href="/en/about/news/significant-increase-of-applicants-for-international-programmes-1.265744">Significant increase of applicants for international programmes</a>
</h1>, <h1>News</h1>, <h2>Semester start information</h2>, <h2>Meet our students</h2>, <h1 class="visuallyhidden">Education</h1>, <h1>Welcome to the education web at DSV!</h1>, <h1>Master's Programmes at DSV</h1>, <h2>
Master's Programmes in English:</h2>, <h1 class="visuallyhidden">Research</h1>, <h1>Research highlights</h1>, <h2>Research news</h2>, <h1 class="visuallyhidden">About us</h1>, <h1>About DSV</h1>, <h2>Sweden's oldest IT department</h2>, <h2>Interdisciplinary education and research</h2>, <h2>Right in the middle of one of the world's leading ICT clusters</h2>, <h1 class="visuallyhidden">Internal</h1>, <h1>Internal</h1>, <h2>Semester start information</h2>, <h2>Meet our students</h2>]
サイト内のすべてのリンクを文字通り辿ってスクレイピングしたいのであれば、scrapy を検討してください。やみくもに全リンクを訪問すると、サイト外のどこへでも誘導されたり無限ループに陥ったりするため、単純ではありません。対象ドメイン内にとどまっていることを確認する必要があり、scrapy の CrawlSpider を使えばそれを非常に簡単に実現できます。
自前で実装するなら:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
class Crawl:
    """Shallow crawler: fetch a start page, follow each of its menu
    links, and from every page linked on those pages yield all h1-h6
    tags, visiting each URL at most once.
    """

    def __init__(self, start_url, allowed, base, select):
        self.start_url = start_url      # page whose menu seeds the crawl
        self.base = base                # base URL for resolving relative hrefs
        self.allowed_domain = allowed   # domain restriction (unused in this version)
        self.crawled = set()            # URLs already fetched
        self.select = select            # CSS selector locating the menu links

    def start(self):
        """Fetch the start page and crawl each menu link in turn,
        yielding lists of header tags per page."""
        r = requests.get(self.start_url)
        soup = BeautifulSoup(r.content, "lxml")
        menu_links = [urljoin(self.base, a["href"])
                      for a in soup.select(self.select)]
        for lnk in menu_links:
            yield from self.crawl(lnk)

    def crawl(self, lnk):
        """Fetch *lnk*, then visit each absolute link found on it once,
        yielding the h1-h6 tags of every visited page."""
        r = requests.get(lnk)
        soup = BeautifulSoup(r.content, "lxml")
        headers = ["h1", "h2", "h3", "h4", "h5", "h6"]
        page_links = (a["href"] for a in soup.select("a[href]"))
        # BUG FIX: relative links were joined against the *global* name
        # `base`; use the instance attribute so the class is
        # self-contained. (Also renamed the loop variable, which
        # shadowed the `lnk` parameter in the original.)
        joined = (urljoin(self.base, href) if href.startswith("/en/") else href
                  for href in page_links)
        for link in filter(lambda u: u.startswith("http"), joined):
            if link not in self.crawled:
                soup = BeautifulSoup(requests.get(link).content, "lxml")
                for h in headers:
                    yield soup.find_all(h)
                self.crawled.add(link)
サンプル実行(さらに深くたどりたい場合は後述のコードを参照):
In [2]: from itertools import chain
In [3]: url = "http://dsv.su.se/en"
In [4]: base = "http://dsv.su.se"
In [5]: crawler = Crawl(url, "dsv.su.se", base, "#HeadRowMenu a")
In [6]: for h in chain.from_iterable(crawler.start()):
...: print(h)
...:
<h1 class="visuallyhidden">Institutionen för data- och systemvetenskap</h1>
<h1>
<a href="/omdsv/evenemang/dsv-50-%C3%A5r-digitala-aff%C3%A4rer-%C3%B6ppet-jubileumsseminarium-1.274298">*DSV 50 år* - Digitala affärer - öppet jubileumsseminarium </a>
</h1>
<h1>
<a href="/omdsv/nyheter/premi%C3%A4r-f%C3%B6r-vandringsdramat-exil-fria-poeter-p%C3%A5-flykt-1.278502">Premiär för vandringsdramat Exil - fria poeter på flykt</a>
</h1>
<h1>
<a href="/omdsv/nyheter/nu-b%C3%B6r-det-st%C3%A5-klart-att-n%C3%A5got-m%C3%A5ste-g%C3%B6ras-1.277680">Nu bör det stå klart att något måste göras </a>
</h1>
<h1>
<a href="/omdsv/nyheter/hur-enkelt-%C3%A4r-det-f%C3%B6r-fbi-att-kn%C3%A4cka-en-iphone-utan-apples-hj%C3%A4lp-1.277546">Hur enkelt är det för FBI att knäcka en Iphone utan Apples hjälp?</a>
</h1>
<h1>
<a href="/omdsv/nyheter/1-av-2-vill-l%C3%A5ta-staten-hacka-sig-in-i-datorer-1.277367">Svårt att backa tillbaka från ökad övervakning</a>
</h1>
<h1>Senaste nyheterna</h1>
<h2 class="category">Kommande evenemang</h2>
<h2>Information inför terminsstart</h2>
<h1 class="visuallyhidden">Other languages</h1>
<h1>Other languages</h1>
<h2>
Information in Chinese and Russian</h2>
<h2>Contact The Administration of Studies</h2>
<h1 class="visuallyhidden">Department of Computer and Systems Sciences</h1>
<h1>
<a href="/en/about/news/improve-your-digital-competences-on-line-with-eskills-match-1.278510">Improve your digital competences on-line with eSkills Match</a>
</h1>
<h1>
<a href="/en/about/news/envisioning-equality-in-computer-science-tomorrow-today-1.272045">Envisioning Equality in Computer Science - Tomorrow Today</a>
</h1>
<h1>
<a href="/en/about/news/egovlab-develops-online-democracy-1.271839">eGovlab develops online democracy</a>
</h1>
<h1>
<a href="/en/about/events/vinnova-and-dsv-invite-you-to-a-seminar-about-horizon-2020-1.266104">Vinnova and DSV invite you to a seminar about Horizon 2020</a>
</h1>
<h1>
<a href="/en/about/news/significant-increase-of-applicants-for-international-programmes-1.265744">Significant increase of applicants for international programmes</a>
</h1>
<h1>News</h1>
<h2>Semester start information</h2>
<h2>Meet our students</h2>
...................................
より深く辿るには、すべてのリンクをひとつのデータ構造に格納し、それが空になるまでループするロジックを追加する必要があります。以下のコードは URL に dsv.su.se を含むサイト内のすべてのリンクを訪問しますが、スクレイピング対象のリンクが非常に多いため、かなり時間がかかることを覚悟してください。
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from time import sleep
class Crawl:
    """Full-site crawler: starting from the menu links on *start_url*,
    keep following every link whose URL contains *allowed*, visiting
    each URL exactly once and yielding all h1-h6 tags per page.
    """

    def __init__(self, start_url, allowed, base, select):
        self.start_url = start_url      # page whose menu seeds the crawl
        self.base = base                # base URL for resolving relative hrefs
        self.allowed_domain = allowed   # substring a URL must contain to be followed
        self.crawled = set()            # URLs already visited
        self.select = select            # CSS selector locating the seed menu links
        self.urls = set()               # frontier: URLs still to visit

    def start(self):
        """Fetch the start page and crawl from each menu link."""
        r = requests.get(self.start_url)
        soup = BeautifulSoup(r.content, "lxml")
        menu_links = [urljoin(self.base, a["href"])
                      for a in soup.select(self.select)]
        print(menu_links)
        for lnk in menu_links:
            yield from self.crawl(lnk)

    def filter_urls(self, soup):
        """Return the set of links on *soup* within the allowed domain."""
        page_links = [a["href"] for a in soup.select("a[href]")]
        # BUG FIX: relative links were joined against the *global* name
        # `base`; use the instance attribute so the class is
        # self-contained.
        joined = (urljoin(self.base, lnk) if lnk.startswith("/en/") else lnk
                  for lnk in page_links)
        return set(filter(lambda lnk: self.allowed_domain in lnk, joined))

    def crawl(self, lnk):
        """Breadth-style crawl seeded by *lnk*: pop URLs from the
        frontier until it is empty, yielding h1-h6 tag lists per page.
        Failed requests are logged and skipped."""
        r = requests.get(lnk)
        soup = BeautifulSoup(r.content, "lxml")
        headers = ["h1", "h2", "h3", "h4", "h5", "h6"]
        self.urls.update(self.filter_urls(soup))
        while self.urls:
            nxt = self.urls.pop()
            if nxt not in self.crawled:
                try:
                    soup = BeautifulSoup(requests.get(nxt).content, "lxml")
                except requests.exceptions.RequestException as e:
                    # BUG FIX: requests exceptions have no .strerror
                    # attribute (that printed None / raised
                    # AttributeError); print the exception itself.
                    print(e)
                    self.crawled.add(nxt)
                    continue
                self.urls.update(self.filter_urls(soup) - self.crawled)
                for h in headers:
                    yield soup.find_all(h)
                self.crawled.add(nxt)
            # Be polite to the server between requests.
            sleep(.1)
といった具合です。