Scraping wunderground without API, using python

They have added some additional tables at the top, so just searching for `table` will not work now. I used a class selector with the full class name to fetch the records, and it works fine:

tables = WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "mat-table.cdk-table.mat-sort.ng-star-inserted")))

Another direction: Use the API calls that the website is doing.

(The HTTP call was taken from Chrome developer tools)

Example:

HTTP GET https://api-ak.wunderground.com/api/d8585d80376a429e/history_20180812/lang:EN/units:english/bestfct:1/v:2.0/q/HSSS.json?showObs=0&ttl=120

Response

{
    "response": {
        "version": "2.0",
        "units": "english",
        "termsofService": "https://www.wunderground.com/weather/api/d/terms.html",
        "attribution": {
        "image":"//icons.wxug.com/graphics/wu2/logo_130x80.png",
        "title":"Weather Underground",
        "link":"http://www.wunderground.com"
        },
        "features": {
        "history": 1
        }
        , "location": {
        "name": "Khartoum",
        "neighborhood":null,
        "city": "Khartoum",
        "state": null,
        "state_name":"Sudan",
        "country": "SD",
        "country_iso3166":"SA",
        "country_name":"Saudi Arabia",
        "continent":"AS",
        "zip":"00000",
        "magic":"474",
        "wmo":"62721",
        "radarcode":"xxx",
        "radarregion_ic":null,
        "radarregion_link": "//",
        "latitude":15.60000038,
        "longitude":32.54999924,
        "elevation":null,
        "wfo": null,
        "l": "/q/zmw:00000.474.62721",
        "canonical": "/weather/sa/khartoum"
        },
        "date": {
    "epoch": 1553287561,
    "pretty": "11:46 PM EAT on March 22, 2019",
    "rfc822": "Fri, 22 Mar 2019 23:46:01 +0300",
    "iso8601": "2019-03-22T23:46:01+0300",
    "year": 2019,
    "month": 3,
    "day": 22,
    "yday": 80,
    "hour": 23,
    "min": "46",
    "sec": 1,
    "monthname": "March",
    "monthname_short": "Mar",
    "weekday": "Friday",
    "weekday_short": "Fri",
    "ampm": "PM",
    "tz_short": "EAT",
    "tz_long": "Africa/Khartoum",
    "tz_offset_text": "+0300",
    "tz_offset_hours": 3.00
}
    }
        ,
"history": {
    "start_date": {
    "epoch": 1534064400,
    "pretty": "12:00 PM EAT on August 12, 2018",
    "rfc822": "Sun, 12 Aug 2018 12:00:00 +0300",
    "iso8601": "2018-08-12T12:00:00+0300",
    "year": 2018,
    "month": 8,
    "day": 12,
    "yday": 223,
    "hour": 12,
    "min": "00",
    "sec": 0,
    "monthname": "August",
    "monthname_short": "Aug",
    "weekday": "Sunday",
    "weekday_short": "Sun",
    "ampm": "PM",
    "tz_short": "EAT",
    "tz_long": "Africa/Khartoum",
    "tz_offset_text": "+0300",
    "tz_offset_hours": 3.00
},
    "end_date": {
    "epoch": null,
    "pretty": null,
    "rfc822": null,
    "iso8601": null,
    "year": null,
    "month": null,
    "day": null,
    "yday": null,
    "hour": null,
    "min": null,
    "sec": null,
    "monthname": null,
    "monthname_short": null,
    "weekday": null,
    "weekday_short": null,
    "ampm": null,
    "tz_short": null,
    "tz_long": null,
    "tz_offset_text": null,
    "tz_offset_hours": null
},
    "days": [
        {
        "summary": {
        "date": {
    "epoch": 1534021200,
    "pretty": "12:00 AM EAT on August 12, 2018",
    "rfc822": "Sun, 12 Aug 2018 00:00:00 +0300",
    "iso8601": "2018-08-12T00:00:00+0300",
    "year": 2018,
    "month": 8,
    "day": 12,
    "yday": 223,
    "hour": 0,
    "min": "00",
    "sec": 0,
    "monthname": "August",
    "monthname_short": "Aug",
    "weekday": "Sunday",
    "weekday_short": "Sun",
    "ampm": "AM",
    "tz_short": "EAT",
    "tz_long": "Africa/Khartoum",
    "tz_offset_text": "+0300",
    "tz_offset_hours": 3.00
},
        "temperature": 82,
    "dewpoint": 66,
    "pressure": 29.94,
    "wind_speed": 11,
    "wind_dir": "SSE",
    "wind_dir_degrees": 166,
    "visibility": 5.9,
    "humidity": 57,
    "max_temperature": 89,
    "min_temperature": 75,
    "temperature_normal": null,
    "min_temperature_normal": null,
    "max_temperature_normal": null,
    "min_temperature_record": null,
    "max_temperature_record": null,
    "min_temperature_record_year": null,
    "max_temperature_record_year": null,
    "max_humidity": 83,
    "min_humidity": 40,
    "max_dewpoint": 70,
    "min_dewpoint": 63,
    "max_pressure": 29.98,
    "min_pressure": 29.89,
    "max_wind_speed": 22,
    "min_wind_speed": 5,
    "max_visibility": 6.2,
    "min_visibility": 1.9,
    "fog": 0,
    "hail": 0,
    "snow": 0,
    "rain": 1,
    "thunder": 0,
    "tornado": 0,
    "snowfall": null,
    "monthtodatesnowfall": null,
    "since1julsnowfall": null,
    "snowdepth": null,
    "precip": 0.00,
    "preciprecord": null,
    "preciprecordyear": null,
    "precipnormal": null,
    "since1janprecipitation": null,
    "since1janprecipitationnormal": null,
    "monthtodateprecipitation": null,
    "monthtodateprecipitationnormal": null,
    "precipsource": "3Or6HourObs",
    "gdegreedays": 32,
    "heatingdegreedays": 0,
    "coolingdegreedays": 17,
    "heatingdegreedaysnormal": null,
    "monthtodateheatingdegreedays": null,
    "monthtodateheatingdegreedaysnormal": null,
    "since1sepheatingdegreedays": null,
    "since1sepheatingdegreedaysnormal": null,
    "since1julheatingdegreedays": null,
    "since1julheatingdegreedaysnormal": null,
    "coolingdegreedaysnormal": null,
    "monthtodatecoolingdegreedays": null,
    "monthtodatecoolingdegreedaysnormal": null,
    "since1sepcoolingdegreedays": null,
    "since1sepcoolingdegreedaysnormal": null,
    "since1jancoolingdegreedays": null,
    "since1jancoolingdegreedaysnormal": null
,
        "avgoktas": 5,
        "icon": "rain"
        }
        }
    ]
}
}

I do it in the following manner.

I open developer tools using Ctrl+Shift+I, then I submit a request via the website while recording the network traffic (in this case, you just click on the View button). Then I filter the recorded requests for XHR.

In the requests that remain, I go over the response of each one. When a response looks like the data I want, I take its request URL and use it. It might be best to copy the response into a separate JSON file and beautify it so it's easy to read and determine whether that's what you want.

In my scenario, my request URL was a get request to the following https://api.weather.com/v1/location/OLBA:9:LB/observations/historical.json?apiKey=_____________&units=e&startDate=20200305

I removed my API key from the URL above; substitute your own key (visible in your recorded request) when you use it.

When you paste the URL into a browser you should get the same response and then you can use Python requests package to get the response and just parse the JSON.


You could use Selenium to ensure the page has loaded, then pandas `read_html` to extract the tables:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from io import StringIO

# Scrape the history tables from a wunderground daily-history page.
url = 'https://www.wunderground.com/history/daily/sd/khartoum/HSSS/date/2019-03-12'
driver = webdriver.Chrome()
try:
    driver.get(url)
    # Wait up to 20 s for at least one <table> element to appear in the DOM
    # (the page renders its tables client-side via Angular).
    tables = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table"))
    )
    for table in tables:
        # Wrap the HTML in StringIO: passing a literal HTML string to
        # read_html is deprecated since pandas 2.1.
        parsed = pd.read_html(StringIO(table.get_attribute('outerHTML')))
        if parsed:
            print(parsed[0].fillna(''))
finally:
    # Always release the browser process, even on timeout or parse errors.
    driver.quit()