1  Web Scraping Wiki tables with Beautiful Soup


April 7, 2023

Scraping Wikipedia to extract lists of municipalities, ghost towns, and unincorporated communities in Texas.

Packages needed

import json
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

List of Municipalities in Texas

def table_extraction(a, b, table_class="wikitable sortable"):
    """Scrape one table from a web page and save its cells as pipe-delimited text.

    Downloads the page at ``a``, takes the first ``<table>`` returned by
    ``soup.find("table", class_=table_class)`` and writes one line per
    ``<tr>`` to ``list of texas places/{b}.txt``. Every ``<td>`` contributes
    its stripped text followed by ``|``; a row without ``<td>`` cells (for
    example a header row made only of ``<th>``) becomes an empty line.

    Args:
        a: URL of the page to scrape.
        b: Base name (without extension) of the output text file.
        table_class: ``class`` value used to locate the table.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        ValueError: If the page contains no matching table.
    """
    response = requests.get(a, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()

    # "html.parser" is Python's built-in HTML parser (not lxml).
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", class_=table_class)
    if table is None:
        # Checked before the file is opened so a failed scrape never
        # leaves behind (or truncates) an output file.
        raise ValueError(f"no table with class {table_class!r} found at {a}")

    lines = [
        "".join(f"{td.get_text(strip=True)}|" for td in row.find_all("td")) + "\n"
        for row in table.find_all("tr")
    ]

    out_path = Path(f"list of texas places/{b}.txt")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        # The original built the text but never wrote it to the file.
        f.writelines(lines)

def do_all(a, b):
    """Scrape the table at URL ``a`` and save it under the file name ``b``.

    Thin wrapper that forwards both arguments to ``table_extraction``.
    """
    table_extraction(a, b)


do_all("https://en.wikipedia.org/wiki/List_of_municipalities_in_Texas","List of municipalities in Texas")

List of Unincorporated Communities in Texas

def table_extraction(a, b, table_class="wikitable sortable mw-collapsible"):
    """Scrape one table from a web page and save its cells as pipe-delimited text.

    Downloads the page at ``a``, takes the first ``<table>`` returned by
    ``soup.find("table", class_=table_class)`` and writes one line per
    ``<tr>`` to ``list of texas places/{b}.txt``. Every ``<td>`` contributes
    its stripped text followed by ``|``; a row without ``<td>`` cells (for
    example a header row made only of ``<th>``) becomes an empty line.

    Args:
        a: URL of the page to scrape.
        b: Base name (without extension) of the output text file.
        table_class: ``class`` value used to locate the table.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        ValueError: If the page contains no matching table.
    """
    response = requests.get(a, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()

    # "html.parser" is Python's built-in HTML parser (not lxml).
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", class_=table_class)
    if table is None:
        # Checked before the file is opened so a failed scrape never
        # leaves behind (or truncates) an output file.
        raise ValueError(f"no table with class {table_class!r} found at {a}")

    lines = [
        "".join(f"{td.get_text(strip=True)}|" for td in row.find_all("td")) + "\n"
        for row in table.find_all("tr")
    ]

    out_path = Path(f"list of texas places/{b}.txt")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        # The original built the text but never wrote it to the file.
        f.writelines(lines)

def do_all(a, b):
    """Scrape the table at URL ``a`` and save it under the file name ``b``.

    Thin wrapper that forwards both arguments to ``table_extraction``.
    """
    table_extraction(a, b)


do_all("https://en.wikipedia.org/wiki/List_of_unincorporated_communities_in_Texas", "List of unincorporated communities in Texas")

List of Ghost Towns in Texas

def table_extraction(a, b, table_class="wikitable sortable"):
    """Scrape one table from a web page and save its cells as pipe-delimited text.

    Downloads the page at ``a``, takes the first ``<table>`` returned by
    ``soup.find("table", class_=table_class)`` and writes one line per
    ``<tr>`` to ``list of texas places/{b}.txt``. Every ``<td>`` contributes
    its stripped text followed by ``|``; a row without ``<td>`` cells (for
    example a header row made only of ``<th>``) becomes an empty line.

    Args:
        a: URL of the page to scrape.
        b: Base name (without extension) of the output text file.
        table_class: ``class`` value used to locate the table.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        ValueError: If the page contains no matching table.
    """
    response = requests.get(a, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()

    # "html.parser" is Python's built-in HTML parser (not lxml).
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", class_=table_class)
    if table is None:
        # Checked before the file is opened so a failed scrape never
        # leaves behind (or truncates) an output file.
        raise ValueError(f"no table with class {table_class!r} found at {a}")

    lines = [
        "".join(f"{td.get_text(strip=True)}|" for td in row.find_all("td")) + "\n"
        for row in table.find_all("tr")
    ]

    out_path = Path(f"list of texas places/{b}.txt")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        # The original built the text but never wrote it to the file.
        f.writelines(lines)

def do_all(a, b):
    """Scrape the table at URL ``a`` and save it under the file name ``b``.

    Thin wrapper that forwards both arguments to ``table_extraction``.
    """
    table_extraction(a, b)


do_all("https://en.wikipedia.org/wiki/List_of_ghost_towns_in_Texas", "List of ghost towns in Texas")