Newton's implementation during RLOS Fest 2020 #10

Open · wants to merge 2 commits into master
1,750 changes: 1,750 additions & 0 deletions RLOS Fest 2020: Estimators Library - A Walkthrough.ipynb

Large diffs are not rendered by default.

73 changes: 73 additions & 0 deletions action_value_model.py
@@ -0,0 +1,73 @@
class ActionValueModel:
    r"""
    An abstract class representing an action value model for a fixed policy
    \pi, a.k.a. Q_\pi, in a Reinforcement Learning (RL) setting.

    All other action value models should subclass it. All subclasses should
    override ``q``, which returns the Q value prediction for an Observation
    and an action, Q(obs.state, action); and ``update``, which updates the
    Q value model given two subsequent observations.
    """

    def __init__(self, horizon, action_num, policy):
        r"""
        Initializes variables to store episode data and all the Q models.

        Arguments:
            horizon: RL horizon (fixed horizon setting), a.k.a. H.
            action_num: number of possible actions (indexing starts at 1)
            policy: a Policy object for which we want to learn Q.
        """
        self.horizon = horizon
        self.action_num = action_num
        self.policy = policy

    def q(self, index, observation, action):
        r"""
        The prediction for Q_\pi at index, for the state-action pair
        (observation, action).

        Arguments:
            index: the number of steps already passed in the episode,
                in 1 ... H
            observation: an Observation, where obs.state is s_i in Q_\pi(s_i, a_i)
            action: the action a_i in Q_\pi(s_i, a_i)

        Returns the predicted cumulative reward for the last t observations of
        the episode, playing a_i and then \pi.
        """
        raise NotImplementedError

    def v(self, index, observation):
        r"""
        Integrates Q_\pi over all actions and their probabilities for the
        observation at index to return the cumulative reward to the end of the
        episode, playing \pi.

        Arguments:
            index: the number of steps already passed in the episode,
                in 1 ... H
            observation: an Observation, where obs.state is s_i in V_\pi(s_i)

        Returns the predicted cumulative reward for the last t observations of
        the episode.
        """
        if index > self.horizon:
            return 0

        v = 0
        for action, p in enumerate(self.policy.get_action_probas(index, observation)):
            v += p * self.q(index, observation, action + 1)
        return v

    def update(self, index, observation, next_observation):
        r"""
        Updates Q_\pi at index, with a new observation and its next state.

        Arguments:
            index: the number of steps already passed in the episode,
                in 1 ... H
            observation: an Observation of state, action, proba, reward
            next_observation: an Observation used to get the next state after
                observation in Q learning
        """
        raise NotImplementedError
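
As a point of reference only (not part of this PR), a minimal sketch of a concrete subclass might look like the following, assuming an Observation exposes .state, .action and .reward attributes; the tabular storage and the learning rate lr are illustrative choices:

from collections import defaultdict

class TabularActionValueModel(ActionValueModel):
    r"""Illustrative tabular Q model for a fixed policy \pi (sketch only)."""

    def __init__(self, horizon, action_num, policy, lr=0.1):
        super().__init__(horizon, action_num, policy)
        self.lr = lr
        # One Q table per step index 1 ... H, keyed by (state, action).
        self.tables = [defaultdict(float) for _ in range(horizon + 1)]

    def q(self, index, observation, action):
        # Look up the current estimate of Q_\pi(s_i, a_i) at this step.
        return self.tables[index][(observation.state, action)]

    def update(self, index, observation, next_observation):
        # One-step target: observed reward plus V_\pi of the next state
        # (v returns 0 once index exceeds the horizon).
        target = observation.reward + self.v(index + 1, next_observation)
        key = (observation.state, observation.action)
        self.tables[index][key] += self.lr * (target - self.tables[index][key])
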
131 changes: 117 additions & 14 deletions basic-usage.py
@@ -3,9 +3,29 @@
import ips_snips
import mle
import ds_parse


def compute_estimates(log_fp):
import dr
import policy
from vowpalwabbit import pyvw
import os

def compute_estimates(log_fp, dr_mode, pol):
"""
Using a logging policy, computes estimates of expected rewards for different target policies as well as confidence intervals around such estimates, and reports the results

Parameters
----------
log_fp : file(json)
Logging policy
dr_mode: string
The mode of the doubly robust estimator, can be {batch, online}
pol : string
target policy to evaluate

Returns
-------
None
"""

# Init estimators
online = ips_snips.Estimator()
baseline1 = ips_snips.Estimator()
@@ -17,10 +37,39 @@ def compute_estimates(log_fp):
baseline1_cressieread = cressieread.Estimator()
baselineR_cressieread = cressieread.Estimator()

print('Processing: {}'.format(log_fp))
dre_batch = dr.Estimator()
baseline1_dre_batch = dr.Estimator()
baselineR_dre_batch = dr.Estimator()

dre_online = dr.Estimator()
baseline1_dre_online = dr.Estimator()
baselineR_dre_online = dr.Estimator()

reward_estimator_model = pyvw.vw(quiet=True)

if(pol=='constant'):
policy_pi = policy.Constant(10, 5)
elif(pol=='uniformrandom'):
policy_pi = policy.UniformRandom(10)

lines_count = sum(1 for line in open(log_fp))
train_ratio = 0.5
train_size = int(lines_count * train_ratio)

if(dr_mode=='batch'):
print("Batch train size: ", train_size)

load_pretrained_model = False
train_data_str = ""
train_data_file = open("xar.dat","w+")
train_data_batch = []

print('\n Computing estimates... Processing: {}'.format(log_fp))
bytes_count = 0
tot_bytes = os.path.getsize(log_fp)
evts = 0
idxx = 0

for i,x in enumerate(gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
# display progress
bytes_count += len(x)
@@ -32,7 +81,8 @@

# parse dsjson file
if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
data = ds_parse.json_cooked(x)

data = ds_parse.ds_json_parse(x)

if data['skipLearn']:
continue
@@ -51,44 +101,97 @@
online_cressieread.add_example(data['p'], r, data['p'])
baseline1_cressieread.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
baselineR_cressieread.add_example(data['p'], r, 1/data['num_a'])

if(dr_mode == 'online'):
#construct training example (x, a, r)
train_example = str(data['r']) + " |ANamespace Action:" + str(data['a']) + " |SharedNamespace c_user:" + str(data['c_user']) + " c_time_of_day:" + str(data['c_time_of_day'])

#At the beginning, you don't have a model to predict with, so you can only learn a reward estimator
if(idxx==0):
reward_estimator_model = dre_online.reward_estimator(train_example, reward_estimator_model, dr_mode, load_pretrained_model)

#Now you have a reward estimator to predict with
elif(idxx>1):

x = " |SharedNamespace c_user:" + str(data['c_user']) + " c_time_of_day:" + str(data['c_time_of_day'])

#compute estimate
dre_online.add_example(x, data['a'], data['a_vec'], data['p'], data['r'], data['p'], policy_pi.get_action_probas(0, 0), reward_estimator_model)
baseline1_dre_online.add_example(x, data['a'], data['a_vec'], data['p'], data['r'], 1 if data['a'] == 1 else 0, policy_pi.get_action_probas(0, 0), reward_estimator_model)
baselineR_dre_online.add_example(x, data['a'], data['a_vec'], data['p'], data['r'], 1/data['num_a'], policy_pi.get_action_probas(0, 0), reward_estimator_model)

#Update reward estimator
reward_estimator_model = dre_online.reward_estimator(train_example, reward_estimator_model, dr_mode, load_pretrained_model)

elif(dr_mode == 'batch'):

#accumulate training examples
if(idxx < train_size):
train_example = str(data['r']) + " |ANamespace Action:" + str(data['a']) + " |SharedNamespace c_user:" + str(data['c_user']) + " c_time_of_day:" + str(data['c_time_of_day'])

train_data_batch.append(train_example)
train_data_file.write(train_example + "\r\n")

#use batch training examples to train a reward estimator
else:
if(idxx == train_size):
train_data_file.close()
reward_estimator_model = dre_batch.reward_estimator(train_data_batch, reward_estimator_model, dr_mode, load_pretrained_model)

#use reward estimator to compute estimate
x = " |SharedNamespace c_user:" + str(data['c_user']) + " c_time_of_day:" + str(data['c_time_of_day'])
dre_batch.add_example(x, data['a'], data['a_vec'], data['p'], data['r'], data['p'], policy_pi.get_action_probas(0, 0), reward_estimator_model)
baseline1_dre_batch.add_example(x, data['a'], data['a_vec'], data['p'], data['r'], 1 if data['a'] == 1 else 0, policy_pi.get_action_probas(0, 0), reward_estimator_model)
baselineR_dre_batch.add_example(x, data['a'], data['a_vec'], data['p'], data['r'], 1/data['num_a'], policy_pi.get_action_probas(0, 0), reward_estimator_model)

evts += 1
idxx += 1

if log_fp.endswith('.gz'):
len_text = ds_parse.update_progress(i+1)
else:
len_text = ds_parse.update_progress(bytes_count,tot_bytes)


print('\nProcessed {} events out of {} lines'.format(evts,i+1))

print('online_ips:',online.get_estimate('ips'))
print('\nonline_ips:',online.get_estimate('ips'))

print('baseline1_ips:', baseline1.get_estimate('ips'))
print('baseline1 gaussian ci:', baseline1.get_interval('gaussian'))
print('baseline1 clopper pearson ci:', baseline1.get_interval('clopper-pearson'))

print('baselineR_ips:',baselineR.get_estimate('ips'))
print('\nbaselineR_ips:',baselineR.get_estimate('ips'))
print('baselineR gaussian ci:', baselineR.get_interval('gaussian'))
print('baselineR clopper pearson ci:', baselineR.get_interval('clopper-pearson'))


print('online_snips:',online.get_estimate('snips'))
print('\nonline_snips:',online.get_estimate('snips'))
print('baseline1_snips:',baseline1.get_estimate('snips'))
print('baselineR_snips:',baselineR.get_estimate('snips'))

print('online_mle:',online_mle.get_estimate())
print('\nonline_mle:',online_mle.get_estimate())
print('baseline1_mle:',baseline1_mle.get_estimate())
print('baselineR_mle:',baselineR_mle.get_estimate())

print('online_cressieread:',online_cressieread.get_estimate())
print('\nonline_cressieread:',online_cressieread.get_estimate())
print('baseline1_cressieread:',baseline1_cressieread.get_estimate())
print('baselineR_cressieread:',baselineR_cressieread.get_estimate())

if __name__ == '__main__':
if(dr_mode=='online'):
print('\ndre_online:', dre_online.get_estimate())
print('baseline1_dre_online:', baseline1_dre_online.get_estimate())
print('baselineR_dre_online:', baselineR_dre_online.get_estimate())
elif(dr_mode=='batch'):
print('\ndre_batch:', dre_batch.get_estimate())
print('baseline1_dre_batch:', baseline1_dre_batch.get_estimate())
print('baselineR_dre_batch:', baselineR_dre_batch.get_estimate())

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-l','--log_fp', help="data file path (.json or .json.gz format - each line is a dsjson)", required=True)

parser.add_argument('-m','--mode', help="Doubly Robust estimator mode", required=True)
parser.add_argument('-p','--policy', help="Policy to evaluate one of {constant, uniformrandom}", required=True)

args = parser.parse_args()

compute_estimates(args.log_fp)
compute_estimates(args.log_fp, args.mode, args.policy)
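
For reference, with the new arguments the script could be invoked along these lines (data.json is a placeholder file name, not part of this PR):

python basic-usage.py --log_fp data.json --mode batch --policy constant
python basic-usage.py -l data.json.gz -m online -p uniformrandom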