Simple Regression Model

from river import (
    metrics,
    compose,
    preprocessing,
    datasets,
    stats,
    feature_extraction,
)
from deep_river.regression import Regressor
from torch import nn
from pprint import pprint
from tqdm import tqdm

# Load the bike-sharing stream used throughout this example.
dataset = datasets.Bikes()

# Peek at the first (features, target) pair only, to show what a sample
# looks like before building the model.
for x, y in dataset.take(1):
    pprint(x)
    print(f"Number of available bikes: {y}")

{'clouds': 75,
 'description': 'light rain',
 'humidity': 81,
 'moment': datetime.datetime(2016, 4, 1, 0, 0, 7),
 'pressure': 1017.0,
 'station': 'metro-canal-du-midi',
 'temperature': 6.54,
 'wind': 9.3}
Number of available bikes: 1

class MyModule(nn.Module):
    """Small feed-forward network producing one regression value per sample.

    Architecture: ``Linear(n_features, 5) -> ReLU -> Linear(5, 1) -> ReLU``.

    Args:
        n_features: Width of the input expected by the first linear layer.
    """

    def __init__(self, n_features):
        super().__init__()
        self.dense0 = nn.Linear(n_features, 5)
        self.nonlin = nn.ReLU()
        self.dense1 = nn.Linear(5, 1)

    def forward(self, X, **kwargs):
        """Compute the prediction for a batch of inputs.

        Args:
            X: Tensor whose last dimension has size ``n_features``.
            **kwargs: Accepted for call compatibility; ignored.

        Returns:
            Tensor with a last dimension of size 1 holding the predictions.
        """
        X = self.nonlin(self.dense0(X))
        # The output ReLU clamps predictions to be non-negative.
        # No softmax here: normalising over a size-1 dimension always
        # yields exactly 1.0, which would make every prediction constant
        # and block all gradient flow to the weights.
        return self.nonlin(self.dense1(X))


def get_hour(x):
    """Return a copy of the features with an added ``"hour"`` entry.

    The input dict is left untouched so that other consumers of the same
    sample (e.g. a sibling branch of a transformer union, or a later
    ``learn_one`` call) do not see an unexpected extra key.

    Args:
        x: Feature dict containing a ``"moment"`` value exposing ``.hour``.

    Returns:
        A new dict with every item of ``x`` plus ``"hour"``.
    """
    return {**x, "hour": x["moment"].hour}

# Running mean absolute error, updated one sample at a time.
metric = metrics.MAE()

# Two feature branches are united (+): the raw weather readings, and the
# running mean of the target grouped by station and hour of day. The union
# is then standardised and fed to the neural regressor.
# NOTE(review): the union appears to emit 6 features (5 selected + 1 target
# aggregate) while the module is built for 10 inputs — confirm this is
# intended.
model_pipeline = (
    (
        compose.Select("clouds", "humidity", "pressure", "temperature", "wind")
        + (
            get_hour
            | feature_extraction.TargetAgg(
                by=["station", "hour"], how=stats.Mean()
            )
        )
    )
    | preprocessing.StandardScaler()
    | Regressor(module=MyModule(10), loss_fn="mse", optimizer_fn="sgd")
)
model_pipeline

['clouds', [...]

Select (
  clouds
  humidity
  pressure
  temperature
  wind
)

get_hour


def get_hour(x):
    x["hour"] = x["moment"].hour
    return x

y_mean_by_station_and_hour

TargetAgg (
  by=['station', 'hour']
  how=Mean ()
  target_name="y"
)

StandardScaler

StandardScaler (
  with_std=True
)

Regressor

Regressor (
  module=MyModule(
  (dense0): Linear(in_features=10, out_features=5, bias=True)
  (nonlin): ReLU()
  (dense1): Linear(in_features=5, out_features=1, bias=True)
  (softmax): Softmax(dim=-1)
)
  loss_fn="mse"
  optimizer_fn="sgd"
  lr=0.001
  is_feature_incremental=False
  device="cpu"
  seed=42
)

# Progressive (test-then-train) evaluation over the first 5000 samples.
for x, y in tqdm(dataset.take(5000)):
    # Predict before learning so the metric only ever scores samples the
    # model has not been trained on yet.
    y_pred = model_pipeline.predict_one(x)
    metric.update(y_true=y, y_pred=y_pred)
    # Only now reveal the true target to the pipeline.
    model_pipeline.learn_one(x=x, y=y)
# Report the error accumulated over the whole run.
print(f"MAE: {metric.get():.2f}")

0it [00:00, ?it/s]

53it [00:00, 525.82it/s]

157it [00:00, 821.97it/s]

258it [00:00, 906.00it/s]

365it [00:00, 969.28it/s]

474it [00:00, 1010.64it/s]

585it [00:00, 1041.72it/s]

695it [00:00, 1059.12it/s]

806it [00:00, 1072.91it/s]

914it [00:00, 1074.63it/s]

1028it [00:01, 1091.78it/s]

1147it [00:01, 1118.35it/s]

1259it [00:01, 1106.12it/s]

1370it [00:01, 1066.01it/s]

1479it [00:01, 1072.76it/s]

1587it [00:01, 1073.15it/s]

1698it [00:01, 1083.37it/s]

1808it [00:01, 1087.79it/s]

1918it [00:01, 1090.43it/s]

2028it [00:01, 1088.89it/s]

2138it [00:02, 1092.02it/s]

2251it [00:02, 1103.27it/s]

2365it [00:02, 1113.66it/s]

2477it [00:02, 1108.26it/s]

2591it [00:02, 1115.86it/s]

2703it [00:02, 1102.41it/s]

2814it [00:02, 1095.06it/s]

2934it [00:02, 1123.38it/s]

3047it [00:02, 1123.59it/s]

3160it [00:02, 1114.18it/s]

3272it [00:03, 1067.75it/s]

3380it [00:03, 1063.18it/s]

3487it [00:03, 1064.14it/s]

3594it [00:03, 1052.28it/s]

3700it [00:03, 1048.48it/s]

3808it [00:03, 1054.64it/s]

3916it [00:03, 1059.51it/s]

4023it [00:03, 1045.39it/s]

4129it [00:03, 1048.36it/s]

4239it [00:03, 1061.50it/s]

4350it [00:04, 1074.71it/s]

4458it [00:04, 1072.10it/s]

4566it [00:04, 1073.53it/s]

4675it [00:04, 1077.60it/s]

4784it [00:04, 1079.27it/s]

4897it [00:04, 1092.86it/s]

5000it [00:04, 1070.85it/s]

MAE: 6.83