{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Scrape wikipedia for microprocessors\n",
    "- https://en.wikipedia.org/wiki/Transistor_count\n",
    "- 'Moore's law is the observation that the number of transistors in a dense integrated circuit doubles about every two years' - wiki"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [],
   "source": [
    "# // 1.  Import packages that we need:\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "# // Web scraping: \n",
    "import requests\n",
    "import string\n",
    "from bs4 import BeautifulSoup\n",
    "# // OS. Sometimes need this for finding working directory:\n",
    "import os\n",
    "# // datetime\n",
    "from datetime import datetime\n",
    "# // regex library used to detect the presence of particular characters (eg extarcting numbers from string)\n",
    "import re\n",
    "from pprint import pprint \n",
    "\n",
    "# // altair + practice datasets\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scrape with Beautiful Soup\n",
    "\n",
    "URL = \"https://en.wikipedia.org/wiki/Transistor_count\"\n",
    "\n",
    "# // Request the html from the URL:\n",
    "html = requests.get(URL)\n",
    "\n",
    "# // Get the soup of this page\n",
    "soup = BeautifulSoup(html.content, 'html.parser')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notes:\n",
    "- tried many iterations of cleaning unnecessary text from data, multiple instances where one row is formatted slightly differently leading to a incorrect cleaning,\n",
    "    - eg, to clean 'mm2' from each number in the area column, this worked for all but 1 observation which had its units formatted differently\n",
    "        - area data as ' mm2' attached in multiple formats, so extract numbers (including decimals).\n",
    "        row[5] = re.sub(\"[^\\d\\.]\", \"\", row[5])\n",
    "        - then select all but the last character, which always be '2' from the mm2. \n",
    "        row[5] = row[5][:-1]\n",
    "    - worked by instead splicing on 'm' then removing any empty space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Processor</th>\n",
       "      <th>Transistors</th>\n",
       "      <th>Year</th>\n",
       "      <th>Designer</th>\n",
       "      <th>Process</th>\n",
       "      <th>Area mm2</th>\n",
       "      <th>Transistors/mm2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Intel 4004</td>\n",
       "      <td>2250</td>\n",
       "      <td>1971-01-01</td>\n",
       "      <td>Intel</td>\n",
       "      <td>10,000 nm</td>\n",
       "      <td>12</td>\n",
       "      <td>187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TMX 1795</td>\n",
       "      <td>3078</td>\n",
       "      <td>1971-01-01</td>\n",
       "      <td>Texas Instruments</td>\n",
       "      <td>?</td>\n",
       "      <td>30</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Intel 8008</td>\n",
       "      <td>3500</td>\n",
       "      <td>1972-01-01</td>\n",
       "      <td>Intel</td>\n",
       "      <td>10,000 nm</td>\n",
       "      <td>14</td>\n",
       "      <td>250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Toshiba TLCS-12</td>\n",
       "      <td>11000</td>\n",
       "      <td>1973-01-01</td>\n",
       "      <td>Toshiba</td>\n",
       "      <td>6,000 nm</td>\n",
       "      <td>32</td>\n",
       "      <td>343</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Intel 4040</td>\n",
       "      <td>3000</td>\n",
       "      <td>1974-01-01</td>\n",
       "      <td>Intel</td>\n",
       "      <td>10,000 nm</td>\n",
       "      <td>12</td>\n",
       "      <td>250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>159</th>\n",
       "      <td>HiSilicon Kirin 9000</td>\n",
       "      <td>15300000000</td>\n",
       "      <td>2020-01-01</td>\n",
       "      <td>Huawei</td>\n",
       "      <td>5 nm</td>\n",
       "      <td>114</td>\n",
       "      <td>134210526</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>160</th>\n",
       "      <td>Apple A15</td>\n",
       "      <td>15000000000</td>\n",
       "      <td>2021-01-01</td>\n",
       "      <td>Apple</td>\n",
       "      <td>5 nm</td>\n",
       "      <td>107</td>\n",
       "      <td>139301634</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>161</th>\n",
       "      <td>AMD Ryzen 7 5800H</td>\n",
       "      <td>10700000000</td>\n",
       "      <td>2021-01-01</td>\n",
       "      <td>AMD</td>\n",
       "      <td>7 nm</td>\n",
       "      <td>180</td>\n",
       "      <td>59444444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>162</th>\n",
       "      <td>Apple M1 Pro</td>\n",
       "      <td>33700000000</td>\n",
       "      <td>2021-01-01</td>\n",
       "      <td>Apple</td>\n",
       "      <td>5 nm</td>\n",
       "      <td>245</td>\n",
       "      <td>137551020</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>163</th>\n",
       "      <td>Apple M1 Max</td>\n",
       "      <td>57000000000</td>\n",
       "      <td>2021-01-01</td>\n",
       "      <td>Apple</td>\n",
       "      <td>5 nm</td>\n",
       "      <td>432</td>\n",
       "      <td>131944444</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>164 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                Processor  Transistors       Year           Designer  \\\n",
       "0             Intel 4004          2250 1971-01-01              Intel   \n",
       "1               TMX 1795          3078 1971-01-01  Texas Instruments   \n",
       "2             Intel 8008          3500 1972-01-01              Intel   \n",
       "3        Toshiba TLCS-12         11000 1973-01-01            Toshiba   \n",
       "4             Intel 4040          3000 1974-01-01              Intel   \n",
       "..                    ...          ...        ...                ...   \n",
       "159  HiSilicon Kirin 9000  15300000000 2020-01-01             Huawei   \n",
       "160             Apple A15  15000000000 2021-01-01              Apple   \n",
       "161    AMD Ryzen 7 5800H   10700000000 2021-01-01                AMD   \n",
       "162         Apple M1 Pro   33700000000 2021-01-01              Apple   \n",
       "163         Apple M1 Max   57000000000 2021-01-01              Apple   \n",
       "\n",
       "       Process  Area mm2  Transistors/mm2  \n",
       "0    10,000 nm        12              187  \n",
       "1            ?        30              102  \n",
       "2    10,000 nm        14              250  \n",
       "3     6,000 nm        32              343  \n",
       "4    10,000 nm        12              250  \n",
       "..         ...       ...              ...  \n",
       "159       5 nm       114        134210526  \n",
       "160       5 nm       107        139301634  \n",
       "161       7 nm       180         59444444  \n",
       "162       5 nm       245        137551020  \n",
       "163       5 nm       432        131944444  \n",
       "\n",
       "[164 rows x 7 columns]"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# // find all tables\n",
    "tables = soup.find_all('table', class_='wikitable sortable')\n",
    "\n",
    "# loop through each <table> in soup\n",
    "for table in tables:\n",
    "    # only create dataframe of microprocessor data\n",
    "    if 'Apple M1' in table.text:\n",
    "        headers = [header.text.strip() for header in table.find_all('th')]\n",
    "        table_rows = table.find_all('tr')    \n",
    "        rows = []\n",
    "        # skip first row as only contains header tags\n",
    "        for row in table_rows[1:]:\n",
    "            td = row.find_all('td')\n",
    "            row = [row.text.strip() for row in td]\n",
    "\n",
    "            # remove any observations not containing area of chip (as would not be able to calculate density)\n",
    "            # code would break if for some reason a scraped row did not contain 6 obs, so remove any which do (currenly 1)\n",
    "            if len(row) < 6:\n",
    "                row.clear()\n",
    "            elif len(row) == 6:\n",
    "                # cleans data of any references and extra specs by removing any thing in brackets\n",
    "                row = [re.sub(\"\\(.*?\\)\",\"\", i) for i in row]\n",
    "                row = [re.sub(\"\\[.*?\\]\",\"\", i) for i in row]\n",
    "                # removes any non-numbers from transistor count column\n",
    "                row[1] = re.sub(\"[^0-9]\", \"\", row[1])\n",
    "                # only proceeds with processors that have a data point for Area\n",
    "                if (row[5] != \"?\") and (row[5] != \"\"):\n",
    "                    # Split the string on 'm' and return the first part. \n",
    "                    row[5] = row[5].split('m')[0]\n",
    "                    # area data as ' mm2' attached in multiple formats, so extract numbers (including decimals)\n",
    "                    row[5] = re.sub(\"[^\\d\\.]\", \"\", row[5])\n",
    "                    # change formats to int/float/year\n",
    "                    row[1] = int(row[1])\n",
    "                    row[2] = datetime.strptime(row[2], '%Y')\n",
    "                    row[5] = float(row[5]) if '.' in row[5] else int(row[5])\n",
    "                    # add to master array\n",
    "                    rows.append(row)\n",
    "\n",
    "# // create dataframe with 'headers' list as column headers\n",
    "df = pd.DataFrame(rows, columns=headers)\n",
    "\n",
    "# // rename columns\n",
    "df.columns = ['Processor', 'Transistors', 'Year', 'Designer', 'Process', 'Area mm2']\n",
    "\n",
    "# // some amd processors listed as two MOS processes, all launched on 7nm so edit to specify\n",
    "df['Process'] = df['Process'].str.replace(' & 12', '')\n",
    "\n",
    "# // calculate transistor density\n",
    "df['Transistors/mm2'] = df['Transistors']/df['Area mm2']\n",
    "\n",
    "# // change type to integer for better display\n",
    "df = df.astype({\"Transistors/mm2\": int, \"Area mm2\": int})\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "31.051589286638936 %\n",
      "71.74519054553899 %\n"
     ]
    }
   ],
   "source": [
    "# // create growth rate series to add to graph\n",
    "r = (df['Transistors/mm2'].max()/df['Transistors/mm2'].iloc[0])**(1/50)\n",
    "print((r-1)*100,'%') # actual yearly average growth in density density\n",
    "print((r**2-1)*100,'%') # Biennial growth rate in density density\n",
    "\n",
    "growth = []\n",
    "for i in range(1, 51):\n",
    "    growth.append((df['Transistors/mm2'].iloc[0])*(r**i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [],
   "source": [
    "# export as JSON\n",
    "df.to_json('/Users/joshhellings/Documents/OneDrive - University of Bristol/Economics Year 3/Data Science/Github Mirror/Project/Data/wikiScrape_Processors.JSON', orient='records')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "** Code can be re-run as contributors add recent data"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "113903349309e73f3a7191f616987ae20874fe83f45c68fdab869fbcd5d05197"
  },
  "kernelspec": {
   "display_name": "Python 3.8.8 64-bit ('base': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}