{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Scrape Wikipedia for microprocessors\n",
    "- https://en.wikipedia.org/wiki/Transistor_count\n",
    "- 'Moore's law is the observation that the number of transistors in a dense integrated circuit doubles about every two years' - wiki"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [],
   "source": [
    "# // 1. Import packages that we need:\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "# // Web scraping: \n",
    "import requests\n",
    "import string\n",
    "from bs4 import BeautifulSoup\n",
    "# // OS. Sometimes need this for finding working directory:\n",
    "import os\n",
    "# // datetime\n",
    "from datetime import datetime\n",
    "# // regex library used to detect the presence of particular characters (eg extracting numbers from string)\n",
    "import re\n",
    "from pprint import pprint \n",
    "\n",
    "# // altair + practice datasets\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scrape with Beautiful Soup\n",
    "\n",
    "URL = \"https://en.wikipedia.org/wiki/Transistor_count\"\n",
    "# // seconds to wait for Wikipedia before giving up (without this the cell can hang forever)\n",
    "REQUEST_TIMEOUT = 30\n",
    "\n",
    "# // Request the html from the URL:\n",
    "html = requests.get(URL, timeout=REQUEST_TIMEOUT)\n",
    "# // stop here on a 4xx/5xx response rather than parsing an error page in the cells below\n",
    "html.raise_for_status()\n",
    "\n",
    "# // Get the soup of this page\n",
    "soup = BeautifulSoup(html.content, 'html.parser')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notes:\n",
    "- tried many iterations of cleaning unnecessary text from the data; there were multiple instances where one row is formatted slightly differently, leading to an incorrect cleaning,\n",
    "    - eg, to clean 'mm2' from each number in the area column, this worked for all but 1 observation which had its units formatted differently\n",
    "    - area data has ' mm2' attached in multiple formats, so extract numbers (including decimals).\n",
    "        row[5] = re.sub(\"[^\\d\\.]\", \"\", row[5])\n",
    "    - then select all but the last character, which would always be the '2' from the mm2.\n",
    "        row[5] = row[5][:-1]\n",
    "    - worked instead by splitting on 'm' and then removing any empty space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ProcessorTransistorsYearDesignerProcessArea mm2Transistors/mm2
0Intel 400422501971-01-01Intel10,000 nm12187
1TMX 179530781971-01-01Texas Instruments?30102
2Intel 800835001972-01-01Intel10,000 nm14250
3Toshiba TLCS-12110001973-01-01Toshiba6,000 nm32343
4Intel 404030001974-01-01Intel10,000 nm12250
........................
159HiSilicon Kirin 9000153000000002020-01-01Huawei5 nm114134210526
160Apple A15150000000002021-01-01Apple5 nm107139301634
161AMD Ryzen 7 5800H107000000002021-01-01AMD7 nm18059444444
162Apple M1 Pro337000000002021-01-01Apple5 nm245137551020
163Apple M1 Max570000000002021-01-01Apple5 nm432131944444
\n", "

164 rows × 7 columns

\n", "
" ],
      "text/plain": [
       " Processor Transistors Year Designer \\\n",
       "0 Intel 4004 2250 1971-01-01 Intel \n",
       "1 TMX 1795 3078 1971-01-01 Texas Instruments \n",
       "2 Intel 8008 3500 1972-01-01 Intel \n",
       "3 Toshiba TLCS-12 11000 1973-01-01 Toshiba \n",
       "4 Intel 4040 3000 1974-01-01 Intel \n",
       ".. ... ... ... ... \n",
       "159 HiSilicon Kirin 9000 15300000000 2020-01-01 Huawei \n",
       "160 Apple A15 15000000000 2021-01-01 Apple \n",
       "161 AMD Ryzen 7 5800H 10700000000 2021-01-01 AMD \n",
       "162 Apple M1 Pro 33700000000 2021-01-01 Apple \n",
       "163 Apple M1 Max 57000000000 2021-01-01 Apple \n",
       "\n",
       " Process Area mm2 Transistors/mm2 \n",
       "0 10,000 nm 12 187 \n",
       "1 ? 30 102 \n",
       "2 10,000 nm 14 250 \n",
       "3 6,000 nm 32 343 \n",
       "4 10,000 nm 12 250 \n",
       ".. ... ... ... \n",
       "159 5 nm 114 134210526 \n",
       "160 5 nm 107 139301634 \n",
       "161 7 nm 180 59444444 \n",
       "162 5 nm 245 137551020 \n",
       "163 5 nm 432 131944444 \n",
       "\n",
       "[164 rows x 7 columns]"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# // final column names; a usable row must supply exactly one value for each\n",
    "COLUMNS = ['Processor', 'Transistors', 'Year', 'Designer', 'Process', 'Area mm2']\n",
    "# // text used to pick the microprocessor table out of all the tables on the page\n",
    "TABLE_MARKER = 'Apple M1'\n",
    "\n",
    "\n",
    "def clean_row(cells):\n",
    "    \"\"\"Clean the text of one scraped table row.\n",
    "\n",
    "    cells: the stripped text of the row's six <td> tags, in COLUMNS order.\n",
    "    Returns the cleaned row with the transistor count as int, the year as\n",
    "    datetime and the area as int/float. Returns None when any of those three\n",
    "    cannot be read (eg the area is '?' or empty) or the area is not positive,\n",
    "    as the density could not be calculated for that row.\n",
    "    \"\"\"\n",
    "    # cleans data of any references and extra specs by removing anything in brackets\n",
    "    cells = [re.sub(r\"\\(.*?\\)\", \"\", cell) for cell in cells]\n",
    "    # strip afterwards, as removing a trailing reference can leave whitespace behind\n",
    "    cells = [re.sub(r\"\\[.*?\\]\", \"\", cell).strip() for cell in cells]\n",
    "    # removes any non-numbers from transistor count column\n",
    "    count = re.sub(r\"[^0-9]\", \"\", cells[1])\n",
    "    # area data has ' mm2' attached in multiple formats, so take the text before\n",
    "    # the first 'm' and extract numbers (including decimals)\n",
    "    area = re.sub(r\"[^\\d.]\", \"\", cells[5].split('m')[0])\n",
    "    # change formats to int/float/year\n",
    "    try:\n",
    "        cells[1] = int(count)\n",
    "        cells[2] = datetime.strptime(cells[2], '%Y')\n",
    "        cells[5] = float(area) if '.' in area else int(area)\n",
    "    except ValueError:\n",
    "        # missing or malformed count/year/area: skip the row instead of stopping the scrape\n",
    "        return None\n",
    "    # a zero area would make the density infinite\n",
    "    if cells[5] <= 0:\n",
    "        return None\n",
    "    return cells\n",
    "\n",
    "\n",
    "# // find all tables\n",
    "tables = soup.find_all('table', class_='wikitable sortable')\n",
    "\n",
    "# // only create dataframe of microprocessor data; if several tables contain the marker the last one is used\n",
    "matching = [table for table in tables if TABLE_MARKER in table.text]\n",
    "if not matching:\n",
    "    raise ValueError(f\"no 'wikitable sortable' table containing {TABLE_MARKER!r} was found\")\n",
    "table = matching[-1]\n",
    "\n",
    "rows = []     # cleaned rows, ready for the dataframe\n",
    "skipped = []  # rows that were left out, kept so they can be inspected after a re-run\n",
    "# skip first row as only contains header tags\n",
    "for tr in table.find_all('tr')[1:]:\n",
    "    cells = [td.text.strip() for td in tr.find_all('td')]\n",
    "    # a row that does not have exactly 6 cells cannot be mapped onto COLUMNS\n",
    "    cleaned = clean_row(cells) if len(cells) == len(COLUMNS) else None\n",
    "    if cleaned is None:\n",
    "        skipped.append(cells)\n",
    "    else:\n",
    "        # add to master array\n",
    "        rows.append(cleaned)\n",
    "\n",
    "# // create dataframe, naming the columns directly rather than from the scraped header tags\n",
    "df = pd.DataFrame(rows, columns=COLUMNS)\n",
    "\n",
    "# // some amd processors listed as two MOS processes, all launched on 7nm so edit to specify\n",
    "# // regex=False: ' & 12' is plain text, and the default for regex differs between pandas versions\n",
    "df['Process'] = df['Process'].str.replace(' & 12', '', regex=False)\n",
    "\n",
    "# // calculate transistor density\n",
    "df['Transistors/mm2'] = df['Transistors']/df['Area mm2']\n",
    "\n",
    "# // change type to integer for better display\n",
    "df = df.astype({\"Transistors/mm2\": int, \"Area mm2\": int})\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "31.051589286638936 %\n",
      "71.74519054553899 %\n"
     ]
    }
   ],
   "source": [
    "# // create growth rate series to add to graph\n",
    "density = df['Transistors/mm2']\n",
    "first_density = density.iloc[0]\n",
    "# // years between the first processor in the table and the densest one, taken from the data\n",
    "# // (1971 to 2021 = 50 for the table scraped above) so the rate stays right when newer chips are added\n",
    "n_years = df['Year'].loc[density.idxmax()].year - df['Year'].iloc[0].year\n",
    "if n_years <= 0:\n",
    "    raise ValueError('the densest processor must be from a later year than the first row')\n",
    "r = (density.max()/first_density)**(1/n_years)\n",
    "print((r-1)*100,'%') # actual yearly average growth in transistor density\n",
    "print((r**2-1)*100,'%') # biennial growth rate in transistor density\n",
    "\n",
    "# // density implied by constant growth at rate r, one value per year after the first\n",
    "growth = [first_density*(r**i) for i in range(1, n_years + 1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [],
   "source": [
    "# export as JSON\n",
    "# // written to a 'Data' folder relative to the working directory so the notebook runs on any machine;\n",
    "# // set the WIKISCRAPE_DATA_DIR environment variable to write somewhere else\n",
    "data_dir = os.environ.get('WIKISCRAPE_DATA_DIR', 'Data')\n",
    "os.makedirs(data_dir, exist_ok=True)\n",
    "df.to_json(os.path.join(data_dir, 'wikiScrape_Processors.JSON'), orient='records')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Note:** the code can be re-run as contributors add more recent data"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "113903349309e73f3a7191f616987ae20874fe83f45c68fdab869fbcd5d05197"
  },
  "kernelspec": {
   "display_name": "Python 3.8.8 64-bit ('base': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}