how to scrape data from a website code example

Example 1: BeautifulSoup - scraping the link of the website

import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.example.com')
soup = BeautifulSoup(page.content, 'html.parser')

print(soup.select_one('p a').attrs['href'])  # get the link of the website

Example 2: web scraping python

#pip install beautifulsoup4

import os
import requests
from bs4 import BeautifulSoup

url = "https://www.google.com/"
reponse = requests.get(url)

if reponse.ok:
	soup = BeautifulSoup(reponse.text, "lxml")
	title = str(soup.find("title"))

	title = title.replace("<title>", "")
	title = title.replace("</title>", "")
	print("The title is : " + str(title))

os.system("pause")

#python (code name).py

Example 3: web scraping

#pip install beautifulsoup4
#python :

import os
import requests
from bs4 import BeautifulSoup

url = "https://www.google.com/"
reponse = requests.get(url)

if reponse.ok:
	soup = BeautifulSoup(reponse.text, "lxml")
	title = str(soup.find("title"))

	title = title.replace("<title>", "")
	title = title.replace("</title>", "")
	print("The title is : " + str(title))

os.system("pause")

#python (code name).py

Example 4: how to scrape data from a html page saved locally

from bs4 import BeautifulSoup
import html5lib
myFile=open('C:/Users/CSE/AppData/Local/atom/app-1.42.0/practise.html','r')
soup=BeautifulSoup(myFile,"html5lib")
print(soup.prettify())

Example 5: web scraper python

def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number
    of hits that mathematician's Wikipedia page received in the 
    last 60 days, as an `int`
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get(url_root.format(name))

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')

        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]

        if len(hit_link) > 0:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except:
                log_error("couldn't parse {} as an `int`".format(link_text))

    log_error('No pageviews found for {}'.format(name))
    return None

Example 6: web scraper python

>>> raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for i, li in enumerate(html.select('li')):
        print(i, li.text)

0  Isaac Newton
 Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

1  Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

2  Carl F. Gauss
 Leonhard Euler 
 Bernhard Riemann

 3  Leonhard Euler
 Bernhard Riemann

4  Bernhard Riemann

# 5 ... and many more...