DSAN 5200: Lab 5

Author

Jeff Jacobs

library(tidyverse)
library(lubridate)
library(plotly)
# Load the raw chip data, give the columns used below short names, parse the
# release date, and keep only rows with a usable date and transistor count.
chip_df <- read_csv("data/chip_dataset.csv")
chip_df <- chip_df |>
  rename(
    trs = `Transistors (million)`,
    date = `Release Date`,
    type = `Type`
  ) |>
  mutate(
    date_obj = mdy(date),
    # Coerce the transistor count to numeric *before* sorting on it. Entries
    # that are not numbers become NA here (as.numeric() warns about the
    # coercion) and are removed by drop_na() below.
    trs = as.numeric(as.character(trs))
  ) |>
  drop_na(trs, date_obj) |>
  # trs is numeric at this point, so ties on release date are ordered by
  # value rather than by text.
  arrange(date_obj, trs)
chip_df |> select(Product, date_obj, trs) |> head()
Product date_obj trs
NVIDIA NV5 1999-03-15 15
Intel Pentium III Xeon 800 2.8V 2000-01-12 28
Intel Pentium III Xeon 800 5V-12V 2000-01-12 28
NVIDIA Vanta LT 2000-03-01 15
Sony Playstation 2 GPU 250nm 2000-03-04 54
Sony GS-250nm 2000-03-04 54
# Keep only chips released on or after 1 January 2000, then list every
# column together with its type and first few values.
chip_df <- chip_df |>
  filter(date_obj >= as.Date("2000-01-01"))
glimpse(chip_df)
Rows: 3,997
Columns: 14
$ Product             <chr> "Intel Pentium III Xeon 800 2.8V", "Intel Pentium …
$ type                <chr> "CPU", "CPU", "GPU", "GPU", "GPU", "GPU", "GPU", "…
$ date                <chr> "1/12/00", "1/12/00", "3/1/00", "3/4/00", "3/4/00"…
$ `Process Size (nm)` <chr> "180", "180", "250", "250", "250", "180", "180", "…
$ `TDP (W)`           <chr> "25", "25", "unknown", "79", NA, NA, "23", "23", "…
$ `Die Size (mm^2)`   <chr> "106", "106", "63", "279", "279", "111", "111", "1…
$ trs                 <dbl> 28, 28, 15, 54, 54, 30, 30, 30, 30, 30, 28, 28, 25…
$ `Freq (GHz)`        <dbl> 800, 800, 105, 147, NA, NA, 166, 166, 166, NA, 866…
$ Foundry             <chr> "Intel", "Intel", "TSMC", "Sony", "Sony", "TSMC", …
$ Vendor              <chr> "Intel", "Intel", "NVIDIA", "Sony", "Sony", "ATI",…
$ `FP16 GFLOPS`       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `FP32 GFLOPS`       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `FP64 GFLOPS`       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ date_obj            <date> 2000-01-12, 2000-01-12, 2000-03-01, 2000-03-04, 2…
# Count the rows whose parsed release date is missing
chip_df |>
  pull(date_obj) |>
  is.na() |>
  sum()
[1] 0
# Store the calendar year of each release date in its own column
chip_df <- mutate(chip_df, yr = lubridate::year(date_obj))
head(select(chip_df, date_obj, trs, yr))
date_obj trs yr
2000-01-12 28 2000
2000-01-12 28 2000
2000-03-01 15 2000
2000-03-04 54 2000
2000-03-04 54 2000
2000-04-01 30 2000
# Plot every individual chip: release year on x, transistor count on y
by_year_plot <- chip_df |>
  plot_ly(
    x = ~yr,
    y = ~trs,
    type = "scatter",
    mode = "markers+lines"
  )
by_year_plot
# Collapse the data to one row per year holding the mean transistor count
yearly_mean_df <- chip_df |>
  group_by(yr) |>
  summarize(yr_mean_trs = mean(trs))
# yearly_mean_df |> head()

# Start from a base plot that only carries the data and axis mappings,
# then layer a marker trace and a line trace on top of it
yearly_mean_plot <- yearly_mean_df |>
  plot_ly(x = ~yr, y = ~yr_mean_trs)
yearly_mean_plot |>
  add_markers() |>
  add_lines()
# Mean transistor count for every (year, chip type) combination
yearly_mean_type_df <- chip_df |>
  group_by(yr, type) |>
  summarize(yr_mean_trs = mean(trs)) |>
  # summarize() leaves the result grouped by yr, so remove that grouping
  ungroup()
`summarise()` has grouped output by 'yr'. You can override using the `.groups`
argument.
# yearly_mean_type_df |> head()

# Variant 1: one colour per chip type, traces added with the add_*() helpers
yearly_mean_type_df |>
  plot_ly(
    x = ~yr,
    y = ~yr_mean_trs,
    color = ~type,
    colors = c("orange", "lightblue")
  ) |>
  add_lines() |>
  add_markers()

# Variant 2: same mappings, but lines and markers come from one add_trace()
yearly_mean_type_df |>
  plot_ly(
    x = ~yr,
    y = ~yr_mean_trs,
    color = ~type,
    colors = c("orange", "lightblue")
  ) |>
  add_trace(type = "scatter", mode = "lines+markers")

On to Python!

import pandas as pd
import numpy as np
import plotly
# Print a short greeting (presumably a quick check that the Python chunk runs)
print("Hello world")
Hello world
# Read the chip dataset from its CSV file into a DataFrame
chip_df = pd.read_csv("data/chip_dataset.csv")
# Show the column labels as they appear in the file
print(chip_df.columns)
Index(['Product', 'Type', 'Release Date', 'Process Size (nm)', 'TDP (W)',
       'Die Size (mm^2)', 'Transistors (million)', 'Freq (GHz)', 'Foundry',
       'Vendor', 'FP16 GFLOPS', 'FP32 GFLOPS', 'FP64 GFLOPS'],
      dtype='object')
# Preview the first rows of the freshly loaded table
chip_df.head()
           Product Type Release Date  ... FP16 GFLOPS FP32 GFLOPS FP64 GFLOPS
0  AMD Athlon 1000  CPU       6/5/00  ...         NaN         NaN         NaN
1  AMD Athlon 1000  CPU     10/31/00  ...         NaN         NaN         NaN
2  AMD Athlon 1100  CPU      8/14/00  ...         NaN         NaN         NaN
3  AMD Athlon 1133  CPU     10/31/00  ...         NaN         NaN         NaN
4  AMD Athlon 1200  CPU     10/31/00  ...         NaN         NaN         NaN

[5 rows x 13 columns]
import datetime
import dateutil.parser
# Map the verbose CSV headers onto short names that are easier to type
colnames_map = dict([
    ('Transistors (million)', 'trs'),
    ('Release Date', 'date'),
    ('Type', 'type'),
])
chip_df = chip_df.rename(colnames_map, axis='columns')
# Check which dtype pandas inferred for each column
print(chip_df.dtypes)
Product               object
type                  object
date                  object
Process Size (nm)     object
TDP (W)               object
Die Size (mm^2)       object
trs                   object
Freq (GHz)           float64
Foundry               object
Vendor                object
FP16 GFLOPS          float64
FP32 GFLOPS          float64
FP64 GFLOPS          float64
dtype: object
#chip_df['trs'].value_counts()
# Clean the table in one chain; each step works on the result of the previous one
chip_df = (
    chip_df
    # Discard rows with no release date or no transistor count
    .dropna(subset=['date', 'trs'])
    # Parse the date strings into datetime values
    .assign(date_obj=lambda df: df['date'].map(dateutil.parser.parse))
    # Remove the 'unknown' placeholder so the counts can be cast to int
    .loc[lambda df: df['trs'].ne('unknown')]
    .assign(trs=lambda df: df['trs'].astype(int))
    .sort_values(by=['date_obj', 'trs'])
    # Keep releases from 1 January 2000 onward
    .loc[lambda df: df['date_obj'] >= datetime.datetime(2000, 1, 1)]
    .assign(year=lambda df: df['date_obj'].dt.year)
)
chip_df.head()
                                Product type  ...   date_obj  year
25      Intel Pentium III Xeon 800 2.8V  CPU  ... 2000-01-12  2000
26    Intel Pentium III Xeon 800 5V-12V  CPU  ... 2000-01-12  2000
2282                    NVIDIA Vanta LT  GPU  ... 2000-03-01  2000
2284       Sony Playstation 2 GPU 250nm  GPU  ... 2000-03-04  2000
2285                      Sony GS-250nm  GPU  ... 2000-03-04  2000

[5 rows x 15 columns]
# Average transistor count per release year, with year kept as a regular column
chip_year_df = chip_df.groupby('year', as_index=False)['trs'].mean()
chip_year_df
    year           trs
0   2000     36.718750
1   2001     50.432203
2   2002     50.434783
3   2003     79.352381
4   2004    118.644231
5   2005    154.919192
6   2006    272.435000
7   2007    299.369565
8   2008    469.546392
9   2009    612.861111
10  2010    830.295964
11  2011   1163.682028
12  2012   1928.944724
13  2013   2156.952381
14  2014   2465.398936
15  2015   2734.922619
16  2016   4372.013072
17  2017   5424.580645
18  2018   8009.682540
19  2019   6671.800000
20  2020  13142.903226
21  2021  18287.222222
22  2022  23560.364964
23  2023  36357.517730
24  2024  28270.833333
import plotly.graph_objects as go
# Line-and-marker chart of the yearly mean transistor count,
# built by handing the trace to the Figure constructor
tr_year_fig = go.Figure(
    data=[
        go.Scatter(
            x=chip_year_df['year'],
            y=chip_year_df['trs'],
            mode='lines+markers',
        )
    ]
)
tr_year_fig.update_layout(template='simple_white').show()
# Mean transistor count for every (year, chip type) combination
chip_type_df = chip_df[['trs','year','type']].groupby(['year','type']).mean().reset_index()
# Display colour for each chip type; the keys also decide which types get a trace
color_map = {
    'CPU': 'blue',
    'GPU': 'orange'
}
# Keep the per-row colour column available on the summary table
chip_type_df['color'] = chip_type_df['type'].map(color_map)
tr_type_fig = go.Figure();
for cur_type_name, cur_type_color in color_map.items():
    cur_type_df = chip_type_df[chip_type_df['type'] == cur_type_name].copy()
    tr_type_fig.add_trace(
        go.Scatter(
            x=cur_type_df['year'],
            y=cur_type_df['trs'],
            # Label the trace so the legend shows the chip type instead of
            # an auto-generated "trace N" entry
            name=cur_type_name,
            # Apply the mapped colour to both the line and its markers
            line=dict(color=cur_type_color),
            marker=dict(color=cur_type_color),
            mode='lines+markers'
        )
    );
tr_type_fig.update_layout(template='simple_white').show()