library(tidyverse)
library(lubridate)
library(plotly)
DSAN 5200: Lab 5
# Load the raw chip dataset.
chip_df <- read_csv("data/chip_dataset.csv")

# Standardize column names, parse the release date, and coerce the transistor
# count to numeric BEFORE sorting. The raw `trs` column can arrive as text
# (it contains placeholders such as "unknown"), and arranging on the raw
# strings would order same-day chips lexicographically ("100" < "28")
# instead of numerically.
chip_df <- chip_df |>
  rename(
    trs = `Transistors (million)`,
    date = `Release Date`,
    type = `Type`
  ) |>
  mutate(
    date_obj = mdy(date),
    # Non-numeric placeholders become NA here (with a coercion warning).
    trs = as.numeric(as.character(trs))
  ) |>
  # Drop rows whose transistor count or release date could not be parsed.
  drop_na(trs, date_obj) |>
  arrange(date_obj, trs)
chip_df |> select(Product, date_obj, trs) |> head()
| Product | date_obj | trs |
|---|---|---|
| NVIDIA NV5 | 1999-03-15 | 15 |
| Intel Pentium III Xeon 800 2.8V | 2000-01-12 | 28 |
| Intel Pentium III Xeon 800 5V-12V | 2000-01-12 | 28 |
| NVIDIA Vanta LT | 2000-03-01 | 15 |
| Sony Playstation 2 GPU 250nm | 2000-03-04 | 54 |
| Sony GS-250nm | 2000-03-04 | 54 |
chip_df <- chip_df |>
filter(
lubridate::year(date_obj) >= 2000
)
chip_df |> glimpse()
Rows: 3,997
Columns: 14
$ Product <chr> "Intel Pentium III Xeon 800 2.8V", "Intel Pentium …
$ type <chr> "CPU", "CPU", "GPU", "GPU", "GPU", "GPU", "GPU", "…
$ date <chr> "1/12/00", "1/12/00", "3/1/00", "3/4/00", "3/4/00"…
$ `Process Size (nm)` <chr> "180", "180", "250", "250", "250", "180", "180", "…
$ `TDP (W)` <chr> "25", "25", "unknown", "79", NA, NA, "23", "23", "…
$ `Die Size (mm^2)` <chr> "106", "106", "63", "279", "279", "111", "111", "1…
$ trs <dbl> 28, 28, 15, 54, 54, 30, 30, 30, 30, 30, 28, 28, 25…
$ `Freq (GHz)` <dbl> 800, 800, 105, 147, NA, NA, 166, 166, 166, NA, 866…
$ Foundry <chr> "Intel", "Intel", "TSMC", "Sony", "Sony", "TSMC", …
$ Vendor <chr> "Intel", "Intel", "NVIDIA", "Sony", "Sony", "ATI",…
$ `FP16 GFLOPS` <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `FP32 GFLOPS` <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `FP64 GFLOPS` <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ date_obj <date> 2000-01-12, 2000-01-12, 2000-03-01, 2000-03-04, 2…
sum(is.na(chip_df$date_obj))
[1] 0
# Add the calendar year of each release date as its own column.
chip_df <- mutate(chip_df, yr = lubridate::year(date_obj))
chip_df |> select(date_obj, trs, yr) |> head()
| date_obj | trs | yr |
|---|---|---|
| 2000-01-12 | 28 | 2000 |
| 2000-01-12 | 28 | 2000 |
| 2000-03-01 | 15 | 2000 |
| 2000-03-04 | 54 | 2000 |
| 2000-03-04 | 54 | 2000 |
| 2000-04-01 | 30 | 2000 |
# Scatter of every chip's transistor count against its release year,
# drawn with both markers and connecting lines.
by_year_plot <- chip_df |>
  plot_ly(
    x = ~yr,
    y = ~trs,
    type = "scatter",
    mode = "markers+lines"
  )
# Display the per-chip plot.
by_year_plot

# Mean transistor count (in millions) for each release year.
yearly_mean_df <- chip_df |>
  group_by(yr) |>
  summarize(
    yr_mean_trs = mean(trs)
  )
# yearly_mean_df |> head()

# Plot the yearly means; the base figure carries the data and axis mappings,
# and the marker and line layers are added on top of it.
yearly_mean_plot <- plot_ly(
  yearly_mean_df,
  x = ~yr, y = ~yr_mean_trs
)
yearly_mean_plot |>
  add_markers() |>
  add_lines()

# Mean transistor count for each (year, chip type) pair.
# `.groups = "drop"` returns an ungrouped result directly, so no trailing
# ungroup() is needed. Without `.groups`, summarise() reports:
#   `summarise()` has grouped output by 'yr'. You can override using the
#   `.groups` argument.
yearly_mean_type_df <- chip_df |>
  group_by(yr, type) |>
  summarize(
    yr_mean_trs = mean(trs),
    .groups = "drop"
  )
# yearly_mean_type_df |> head()

# Yearly means split by chip type, built from separate line and marker layers.
yearly_mean_type_df |>
  plot_ly(
    x = ~yr, y = ~yr_mean_trs,
    color = ~type, colors = c("orange", "lightblue")
  ) |>
  add_lines() |>
  add_markers()

# The same data drawn with a single combined lines+markers scatter trace.
yearly_mean_type_df |>
  plot_ly(
    x = ~yr, y = ~yr_mean_trs,
    color = ~type, colors = c("orange", "lightblue")
  ) |>
  add_trace(type = "scatter", mode = "lines+markers")

# Onto Python!
import pandas as pd
import numpy as np
import plotly
print("Hello world")
Hello world
chip_df = pd.read_csv("data/chip_dataset.csv")
print(chip_df.columns)
Index(['Product', 'Type', 'Release Date', 'Process Size (nm)', 'TDP (W)',
'Die Size (mm^2)', 'Transistors (million)', 'Freq (GHz)', 'Foundry',
'Vendor', 'FP16 GFLOPS', 'FP32 GFLOPS', 'FP64 GFLOPS'],
dtype='object')
chip_df.head()
Product Type Release Date ... FP16 GFLOPS FP32 GFLOPS FP64 GFLOPS
0 AMD Athlon 1000 CPU 6/5/00 ... NaN NaN NaN
1 AMD Athlon 1000 CPU 10/31/00 ... NaN NaN NaN
2 AMD Athlon 1100 CPU 8/14/00 ... NaN NaN NaN
3 AMD Athlon 1133 CPU 10/31/00 ... NaN NaN NaN
4 AMD Athlon 1200 CPU 10/31/00 ... NaN NaN NaN
[5 rows x 13 columns]
import datetime
import dateutil.parser
# Map the verbose CSV headers to the short column names used below.
colnames_map = dict([
    ('Transistors (million)', 'trs'),
    ('Release Date', 'date'),
    ('Type', 'type'),
])
chip_df = chip_df.rename(colnames_map, axis='columns')
print(chip_df.dtypes)
Product object
type object
date object
Process Size (nm) object
TDP (W) object
Die Size (mm^2) object
trs object
Freq (GHz) float64
Foundry object
Vendor object
FP16 GFLOPS float64
FP32 GFLOPS float64
FP64 GFLOPS float64
dtype: object
# Keep only rows that have both a release date and a transistor count.
chip_df = chip_df.dropna(subset=['date', 'trs'])

# Parse each release-date string into a datetime value.
chip_df['date_obj'] = chip_df['date'].apply(dateutil.parser.parse)

#chip_df['trs'].value_counts()
# Discard the 'unknown' placeholder rows so the column can be cast to int.
is_known = chip_df['trs'] != 'unknown'
chip_df = chip_df.loc[is_known].copy()
chip_df['trs'] = chip_df['trs'].astype(int)

# Order chronologically, breaking ties by transistor count.
chip_df = chip_df.sort_values(by=['date_obj', 'trs'])

# Restrict to chips released on or after 2000-01-01 and record the year.
cutoff = datetime.datetime(2000, 1, 1)
chip_df = chip_df.loc[chip_df['date_obj'] >= cutoff].copy()
chip_df['year'] = chip_df['date_obj'].dt.year
chip_df.head()
Product type ... date_obj year
25 Intel Pentium III Xeon 800 2.8V CPU ... 2000-01-12 2000
26 Intel Pentium III Xeon 800 5V-12V CPU ... 2000-01-12 2000
2282 NVIDIA Vanta LT GPU ... 2000-03-01 2000
2284 Sony Playstation 2 GPU 250nm GPU ... 2000-03-04 2000
2285 Sony GS-250nm GPU ... 2000-03-04 2000
[5 rows x 15 columns]
# Mean transistor count per year, with `year` kept as a regular column.
chip_year_df = chip_df.groupby('year', as_index=False)['trs'].mean()
chip_year_df
year trs
0 2000 36.718750
1 2001 50.432203
2 2002 50.434783
3 2003 79.352381
4 2004 118.644231
5 2005 154.919192
6 2006 272.435000
7 2007 299.369565
8 2008 469.546392
9 2009 612.861111
10 2010 830.295964
11 2011 1163.682028
12 2012 1928.944724
13 2013 2156.952381
14 2014 2465.398936
15 2015 2734.922619
16 2016 4372.013072
17 2017 5424.580645
18 2018 8009.682540
19 2019 6671.800000
20 2020 13142.903226
21 2021 18287.222222
22 2022 23560.364964
23 2023 36357.517730
24 2024 28270.833333
import plotly.graph_objects as go

# Single lines+markers trace of the yearly mean transistor count.
yearly_trace = go.Scatter(
    x=chip_year_df['year'],
    y=chip_year_df['trs'],
    mode='lines+markers',
)
tr_year_fig = go.Figure();
# Trailing semicolon keeps the notebook from echoing the returned figure.
tr_year_fig.add_trace(yearly_trace);
# Render the yearly-mean figure.
tr_year_fig.update_layout(template='simple_white').show()

# Mean transistor count for each (year, chip type) pair.
chip_type_df = (
    chip_df[['trs', 'year', 'type']]
    .groupby(['year', 'type'])
    .mean()
    .reset_index()
)

# One display color per chip type; the lookup raises KeyError on any type
# that is missing from the map.
color_map = {
    'CPU': 'blue',
    'GPU': 'orange'
}
chip_type_df['color'] = chip_type_df['type'].apply(lambda x: color_map[x])

# Draw one lines+markers trace per chip type. Each trace is named after its
# type (so the legend is readable) and uses that type's color from color_map
# for both the line and the markers.
tr_type_fig = go.Figure()
for cur_type_name, cur_color in color_map.items():
    cur_type_df = chip_type_df[chip_type_df['type'] == cur_type_name]
    tr_type_fig.add_trace(
        go.Scatter(
            x=cur_type_df['year'],
            y=cur_type_df['trs'],
            name=cur_type_name,
            line=dict(color=cur_color),
            marker=dict(color=cur_color),
            mode='lines+markers'
        )
    )
tr_type_fig.update_layout(template='simple_white').show()