Scrape Wikipedia for microprocessor transistor counts
- https://en.wikipedia.org/wiki/Transistor_count
- 'Moore's law is the observation that the number of transistors in a dense integrated circuit doubles about every two years' - wiki

In [186]:
# // 1.  Import packages that we need:
import numpy as np
import pandas as pd
# // Web scraping: 
import requests
import string
from bs4 import BeautifulSoup
# // OS. Sometimes need this for finding working directory:
import os
# // datetime
from datetime import datetime
# // regex library used to detect the presence of particular characters (e.g. extracting numbers from a string)
import re
from pprint import pprint 

# // altair + practice datasets


In [187]:
# Scrape with Beautiful Soup

URL = "https://en.wikipedia.org/wiki/Transistor_count"

# // Request the html from the URL.
# // The timeout (seconds) stops the cell hanging indefinitely if the site is unreachable.
html = requests.get(URL, timeout=30)

# // Stop here on an HTTP error (404, 5xx, rate limiting, ...) instead of parsing
# // an error page and only noticing later when the expected table is missing.
html.raise_for_status()

# // Get the soup of this page
soup = BeautifulSoup(html.content, 'html.parser')

Notes:
- Tried many iterations of cleaning unnecessary text from the data; there were multiple instances where one row was formatted slightly differently, leading to incorrect cleaning.
    - eg, to clean 'mm2' from each number in the area column, this worked for all but 1 observation which had its units formatted differently
        - area data as ' mm2' attached in multiple formats, so extract numbers (including decimals).
        row[5] = re.sub("[^\d\.]", "", row[5])
        - then select all but the last character, which will always be the '2' from 'mm2'.
        row[5] = row[5][:-1]
    - this was solved by instead splitting on 'm' and then stripping any remaining non-numeric characters

In [224]:
# // Column names of the cleaned microprocessor table, in the order the cells
# // appear in each scraped row.
COLUMNS = ['Processor', 'Transistors', 'Year', 'Designer', 'Process', 'Area mm2']


def clean_processor_row(cells):
    """Clean one scraped table row and convert its numeric fields.

    Parameters
    ----------
    cells : list of str
        Text of each <td> cell of the row, in table order.

    Returns
    -------
    list or None
        [processor, transistors (int), year (datetime), designer, process,
        area (float)] in the order of COLUMNS, or None when the row is
        deliberately excluded: it does not have exactly six cells, or its
        area cell is empty or '?' (density cannot be calculated without it).

    Raises
    ------
    ValueError
        If the transistor count, year or area cannot be parsed, or the area
        is not positive.
    """
    if len(cells) != len(COLUMNS):
        return None

    # cleans data of any references and extra specs by removing anything in
    # brackets, then strips the whitespace the removal can leave behind
    cells = [re.sub(r"\(.*?\)", "", cell) for cell in cells]
    cells = [re.sub(r"\[.*?\]", "", cell).strip() for cell in cells]
    processor, transistors, year, designer, process, area = cells

    # only proceeds with processors that have a data point for Area
    if area in ("?", ""):
        return None

    # removes any non-numbers from transistor count column
    transistors = int(re.sub(r"[^0-9]", "", transistors))
    year = datetime.strptime(year, '%Y')

    # the ' mm2' unit is attached in multiple formats: keep the text before the
    # first 'm', then keep only digits and the decimal point
    area = float(re.sub(r"[^\d.]", "", area.split('m')[0]))
    if area <= 0:
        # a zero area would give an infinite density
        raise ValueError('area must be positive')

    return [processor, transistors, year, designer, process, area]


# // find all tables
tables = soup.find_all('table', class_='wikitable sortable')

# // only the microprocessor table is wanted; it is identified by a known entry
processor_tables = [table for table in tables if 'Apple M1' in table.text]
if not processor_tables:
    raise ValueError("No 'wikitable sortable' table mentioning 'Apple M1' was found; "
                     "the page layout may have changed")
# // if several tables mention the marker, the last one is used
table = processor_tables[-1]

# // scraped header text, kept for inspection only: the dataframe uses COLUMNS,
# // so extra <th> cells in the table cannot break its construction
headers = [header.text.strip() for header in table.find_all('th')]
table_rows = table.find_all('tr')

rows = []
unparsed_rows = []
# skip first row as only contains header tags
for table_row in table_rows[1:]:
    cells = [cell.text.strip() for cell in table_row.find_all('td')]
    try:
        cleaned = clean_processor_row(cells)
    except ValueError:
        # one oddly formatted row should not abort the whole scrape
        unparsed_rows.append(cells)
        continue
    if cleaned is not None:
        # add to master array
        rows.append(cleaned)

if unparsed_rows:
    print('Skipped', len(unparsed_rows), 'row(s) that could not be parsed - see unparsed_rows')

# // create dataframe with the cleaned column names
df = pd.DataFrame(rows, columns=COLUMNS)

# // some amd processors listed as two MOS processes, all launched on 7nm so edit to specify
df['Process'] = df['Process'].str.replace(' & 12', '', regex=False)

# // calculate transistor density
df['Transistors/mm2'] = df['Transistors']/df['Area mm2']

# // change type to integer for better display
df = df.astype({"Transistors/mm2": int, "Area mm2": int})

df

Unnamed: 0,Processor,Transistors,Year,Designer,Process,Area mm2,Transistors/mm2
0,Intel 4004,2250,1971-01-01,Intel,"10,000 nm",12,187
1,TMX 1795,3078,1971-01-01,Texas Instruments,?,30,102
2,Intel 8008,3500,1972-01-01,Intel,"10,000 nm",14,250
3,Toshiba TLCS-12,11000,1973-01-01,Toshiba,"6,000 nm",32,343
4,Intel 4040,3000,1974-01-01,Intel,"10,000 nm",12,250
...,...,...,...,...,...,...,...
159,HiSilicon Kirin 9000,15300000000,2020-01-01,Huawei,5 nm,114,134210526
160,Apple A15,15000000000,2021-01-01,Apple,5 nm,107,139301634
161,AMD Ryzen 7 5800H,10700000000,2021-01-01,AMD,7 nm,180,59444444
162,Apple M1 Pro,33700000000,2021-01-01,Apple,5 nm,245,137551020


In [241]:
# // create growth rate series to add to graph
density = df['Transistors/mm2']
base_density = density.iloc[0]

# // years between the first processor in the table and the densest one.
# // Taken from the data rather than hard-coded so the rate stays correct
# // when the table is re-scraped after new processors are added.
n_years = df['Year'].loc[density.idxmax()].year - df['Year'].iloc[0].year
if n_years <= 0:
    raise ValueError('The densest processor must be from a later year than the first row')

# // compound annual growth rate: (end / start) ** (1 / years)
r = (density.max()/base_density)**(1/n_years)
print((r-1)*100,'%') # average yearly growth rate in transistor density
print((r**2-1)*100,'%') # biennial growth rate in transistor density

# // projected density for each year after the first, growing at rate r
growth = [base_density*(r**i) for i in range(1, n_years + 1)]

31.051589286638936 %
71.74519054553899 %


In [209]:
# export as JSON
# // The destination defaults to the original project location; set the
# // PROCESSORS_JSON_PATH environment variable to write somewhere else
# // (e.g. when running on another machine).
output_path = os.environ.get(
    'PROCESSORS_JSON_PATH',
    '/Users/joshhellings/Documents/OneDrive - University of Bristol/Economics Year 3/Data Science/Github Mirror/Project/Data/wikiScrape_Processors.JSON',
)
df.to_json(output_path, orient='records')

**Note:** this code can be re-run as Wikipedia contributors add more recent data.