%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Author template for Operations Research (opre) for articles with no e-companion (EC)
%% Mirko Janc, Ph.D., INFORMS, [email protected]
%% ver. 0.95, December 2010
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\documentclass[opre,blindrev]{informs3}
\documentclass[opre,nonblindrev]{informs3} % current default for manuscript submission
\OneAndAHalfSpacedXI % current default line spacing
%%\OneAndAHalfSpacedXII
%%\DoubleSpacedXII
%%\DoubleSpacedXI
% If hyperref is used, dvi-to-ps driver of choice must be declared as
% an additional option to the \documentclass. For example
%\documentclass[dvips,opre]{informs3} % if dvips is used
%\documentclass[dvipsone,opre]{informs3} % if dvipsone is used, etc.
%%% OPRE uses endnotes. If you do not use them, put a percent sign before
%%% the \theendnotes command. This template does show how to use them.
\usepackage{endnotes}
\let\footnote=\endnote
\let\enotesize=\normalsize
\def\notesname{Endnotes}%
\def\makeenmark{$^{\theenmark}$}
\def\enoteformat{\rightskip0pt\leftskip0pt\parindent=1.75em
\leavevmode\llap{\theenmark.\enskip}}
% Private macros here (check that there is no clash with the style)
% Natbib setup for author-year style
\usepackage{natbib}
\bibpunct[, ]{(}{)}{,}{a}{}{,}%
\def\bibfont{\small}%
\def\bibsep{\smallskipamount}%
\def\bibhang{24pt}%
\def\newblock{\ }%
\def\BIBand{and}%
%% Setup of theorem styles. Outcomment only one.
%% Preferred default is the first option.
\TheoremsNumberedThrough % Preferred (Theorem 1, Lemma 1, Theorem 2)
%\TheoremsNumberedByChapter % (Theorem 1.1, Lemma 1.1, Theorem 1.2)
\ECRepeatTheorems
%% Setup of the equation numbering system. Outcomment only one.
%% Preferred default is the first option.
\EquationsNumberedThrough % Default: (1), (2), ...
%\EquationsNumberedBySection % (1.1), (1.2), ...
% In the reviewing and copyediting stage enter the manuscript number.
%\MANUSCRIPTNO{} % When the article is logged in and DOI assigned to it,
% this manuscript number is no longer necessary
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{setspace}
\usepackage{paralist}
\usepackage{graphicx}
\usepackage{url}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{multicol}
\usepackage{csvsimple}
\allowdisplaybreaks[4]
% Frequently used general mathematics
\newcommand{\R}{{\mathbb{R}}}
\newcommand{\Rp}{\R^+}
\newcommand{\Z}{{\mathbb{Z}}}
\newcommand{\Zp}{\Z^+}
\newcommand{\Q}{\mathbb{Q}}
\newcommand{\N}{\mathbb{N}}
% Commands for probability
\renewcommand{\P}{\mathbb{P}}
\newcommand{\E}{\mathbb{E}}
%\renewcommand{\P}[1]{\P \left[ #1 \right]}
\newcommand{\e}[1]{\E \left[ #1 \right]}
\newcommand{\ee}[2]{\E_{#1} \left[ #2 \right]}
% Definitions of variables
\newcommand{\X}{X}
\newcommand{\x}{\mathbf{x}}
\newcommand{\xh}{\hat{\x}}
\newcommand{\lh}{\hat{\lambda}}
\newcommand{\mh}{\hat{\mu}}
\newcommand{\xs}{\x^*}
\newcommand{\xit}{\tilde{\mathbf{\xi}}}
\newcommand{\zt}{\tilde{z}}
\newcommand{\zs}{z^*}
\newcommand{\bpi}{\mathbf{\pi}}
\newcommand{\bpih}{\hat{\bpi}}
% Further variables
\newcommand{\y}{\mathbf{y}}
\renewcommand{\c}{\mathbf{c}}
\newcommand{\A}{\mathbf{A}}
\renewcommand{\b}{\mathbf{b}}
\newcommand{\g}{\mathbf{g}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\B}{\mathbf{B}}
\renewcommand{\d}{\mathbf{d}}
\newcommand{\T}{\mathbf{T}}
\newcommand{\M}{\mathbf{M}}
\renewcommand{\h}{\mathbf{h}}
% Probability vectors
\newcommand{\q}{\mathbf{q}}
\newcommand{\p}{\mathbf{p}}
% Convergence of \plp
\newcommand{\qtrue}{\q^{\text{true}}}
\newcommand{\ob}{\bar{\omega}}
% Useful mathematics functions
\newcommand{\keywords}[1]{\par\noindent\enspace\ignorespaces\textbf{Keywords:} #1}
% \newcommand{\keywords}[1]{\par\addvspace\baselineskip\noindent\keywordname\enspace\ignorespaces #1}
% \DeclareMathOperator*{\argmin}{argmin}
% \theoremstyle{plain}
% \newtheorem{theorem}{Theorem}
% \newtheorem{lemma}[theorem]{Lemma}
% \newtheorem{proposition}[theorem]{Proposition}
% \newtheorem{corollary}[theorem]{Corollary}
%
% \theoremstyle{definition}
% \newtheorem{definition}[theorem]{Definition}
%
% \theoremstyle{remark}
% \newtheorem{remark}[theorem]{Remark}
\newtheorem{property}{Property}
\newcommand{\st}{\mbox{s.t.}}
% Naming shortcuts
\newcommand{\plp}{$\phi$LP-2}
\bibliographystyle{ormsv080}
%%%%%%%%%%%%%%%%
\begin{document}
%%%%%%%%%%%%%%%%
% Outcomment only when entries are known. Otherwise leave as is and
% default values will be used.
%\setcounter{page}{1}
%\VOLUME{00}%
%\NO{0}%
%\MONTH{Xxxxx}% (month or a similar seasonal id)
%\YEAR{0000}% e.g., 2005
%\FIRSTPAGE{000}%
%\LASTPAGE{000}%
%\SHORTYEAR{00}% shortened year (two-digit)
%\ISSUE{0000} %
%\LONGFIRSTPAGE{0001} %
%\DOI{10.1287/xxxx.0000.0000}%
% Author's names for the running heads
% Sample depending on the number of authors;
% \RUNAUTHOR{Jones}
% \RUNAUTHOR{Jones and Wilson}
% \RUNAUTHOR{Jones, Miller, and Wilson}
% \RUNAUTHOR{Jones et al.} % for four or more authors
% Enter authors following the given pattern:
%\RUNAUTHOR{Love and Bayraksan}
\RUNAUTHOR{\ }
% Title or shortened title suitable for running heads. Sample:
% \RUNTITLE{Bundling Information Goods of Decreasing Value}
% Enter the (shortened) title:
%\RUNTITLE{Phi-Divergence Constrained Ambiguous Stochastic Programs}
\RUNTITLE{\ }
% Full title. Sample:
% \TITLE{Bundling Information Goods of Decreasing Value}
% Enter the full title:
\TITLE{Phi-Divergence Constrained Ambiguous Stochastic Programs for Data-Driven Optimization}
% Block of authors and their affiliations starts here:
% NOTE: Authors with same affiliation, if the order of authors allows,
% should be entered in ONE field, separated by a comma.
% \EMAIL field can be repeated if more than one author
\ARTICLEAUTHORS{%
\AUTHOR{David K.\ Love}
\AFF{American Express, New York, NY,
\EMAIL{[email protected]}
%\AFF{Graduate Program in Applied Mathematics, University of Arizona, %\EMAIL{[email protected]}%, \url{http://math.arizona.edu/~dlove/}
}
\AUTHOR{G\"{u}zin~Bayraksan}
\AFF{Department of Integrated Systems Engineering, The Ohio State University, \EMAIL{[email protected]}%, %\url{http://www-iwse.eng.ohio-state.edu/biosketch\_GBayraksan.cfm}
}
% Enter all authors
} % end of the block
\ABSTRACT{% should be <= 200 words. Exactly at that!
This paper investigates the use of $\phi$-divergences in ambiguous (or distributionally robust) two-stage stochastic programs.
Classical stochastic programming assumes that the distribution of the uncertain parameters is known.
However, the true distribution is unknown in many applications.
Especially in cases where there is little data or not much trust in the data, an ambiguity set of distributions can be used to hedge against the distributional uncertainty.
$\phi$-divergences (e.g., Kullback-Leibler divergence, $\chi^2$ distance, etc.) provide a natural way to create an ambiguity set of distributions that are centered around a nominal distribution.
The nominal distribution can be obtained by using observed data, expert opinions, simulations, and so forth.
In this paper, we present a classification of $\phi$-divergences to elucidate their use for models with different properties and sources of data.
We illustrate our classification on $\phi$-divergences that result in common risk optimization models.
A condition for assessing the value of collecting additional data is derived, and we demonstrate that the $\phi$-divergence-based ambiguous program behaves essentially the same as the associated non-ambiguous stochastic program as more data is collected.
We present a decomposition-based solution algorithm to solve the resulting model.
Finally, we demonstrate the behavior of $\phi$-divergences in an optimization setting on a numerical example.
}%
% Sample
%\KEYWORDS{deterministic inventory theory; infinite linear programming duality;
% existence of optimal policies; semi-Markov decision process; cyclic schedule}
% Fill in data. If unknown, outcomment the field
\KEYWORDS{Ambiguous stochastic programming, distributionally robust optimization, phi-divergences, data-driven optimization}
%Optimization under uncertainty, water resources management, ambiguous stochastic programming, robust optimization, environmental sustainability}
%\HISTORY{}
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Samples of sectioning (and labeling) in IJOC
% NOTE: (1) \section and \subsection do NOT end with a period
% (2) \subsubsection and lower need end punctuation
% (3) capitalization is as shown (title style).
%
%\section{Introduction.}\label{intro} %%1.
%\subsection{Duality and the Classical EOQ Problem.}\label{class-EOQ} %% 1.1.
%\subsection{Outline.}\label{outline1} %% 1.2.
%\subsubsection{Cyclic Schedules for the General Deterministic SMDP.}
% \label{cyclic-schedules} %% 1.2.1
%\section{Problem Description.}\label{problemdescription} %% 2.
\section{Introduction}
Many optimization problems can be modeled by stochastic programs minimizing the expected value of an uncertain objective function.
However, if the distribution of the uncertain parameters used in the model is incorrect, the stochastic program can give highly suboptimal results.
This concern has led to the development of a modeling technique that replaces the probability distribution by a set of distributions.
Then, the model optimizes the worst-case expectation with respect to the distributions in this set to hedge against the distributional uncertainty.
This set of distributions is referred to as the {\it ambiguity set} of distributions, sometimes called the {\it uncertainty set}.
This approach is not new, with early results dating back to \cite{scarf1958min} and to Dupa{\v{c}}ov{\'{a}}, writing as \cite{zackova1966minimax}.
This type of model has been referred to as an {\it ambiguous stochastic program}; see, for instance, \cite{pflug2007ambiguity} and \cite{erdogan2006ambiguous}.
More recently, this approach has been called {\it distributionally robust optimization} \citep{delage2010distributionally,goh_sim_10,mehrotra_papp_14,hanasusanto_etal_15}.
There are different ways to form the ambiguity set of distributions.
% (see Section~\ref{sec:lit}).
One recent approach that has been proposed by \citet{bental2013robust} uses a set of distributions that are sufficiently close to a given ``nominal'' distribution according to a $\phi$-divergence.
$\phi$-divergences quantify distances between probability distributions; we will shortly review them in Section \ref{sec:phi_divergences}.
Of particular interest is the case when the nominal distribution takes the form of the empirical distribution determined by direct observation of data.
However, this is not the only means of obtaining data.
In addition to direct observation, data can come from simulations, forecasts, or expert opinions.
% that the decision maker would especially like to be robust against.
%\subsection{Contributions}
In this paper\footnote{This work is largely derived from the dissertation \cite{love_13}.
An earlier version of Section \ref{sec:classification} has appeared in the conference paper \cite{love2014classification}, and select results have been summarized in the tutorial \cite{bayraksan_love_15}.
This paper contains more results, full derivations and proofs, and further numerical illustration of results.
Parts of \cite{bayraksan_love_15} have been included by permission.}
we adapt the $\phi$-divergence based ambiguity sets to two-stage stochastic linear programs with recourse.
We call the resulting model two-stage $\phi$-divergence constrained ambiguous stochastic linear program with recourse and denote it as \plp.
% and examine its properties.
While we focus on \plp, some of our results apply to a broad class of distributionally robust optimization problems using $\phi$-divergences---we will point to these shortly.
%Throughout the paper, we consider distributions with finite support.
Data-driven models in the literature typically use empirical probabilities obtained through direct observations.
In our modeling framework, we also allow unobserved data points (e.g., those given by expert opinions) to be represented in the model with zero nominal probabilities.
We examine this case in more detail in the paper.
%\subsection{Related Literature}
Stochastic programs with uncertain objective functions have long been studied by applying the minimax approach to an expected cost; see, e.g., \cite{zackova1966minimax} and \cite{dupacova_87}.
Two seminal papers by \cite{shapiro2002minimax} and \cite{shapiro2004class} developed methods for converting stochastic minimax problems into equivalent stochastic programs with a certain distribution, laying the foundation for a commonly used reformulation technique.
In recent years, there has been a growing interest in distributionally robust methods.
One common method for forming the ambiguity set is moment based, where all distributions that have the same moments (mean, variance, covariance, etc.) are admitted into the set.
An early example comes from \citet{scarf1958min}, who provided a distributionally robust model for the newsvendor problem.
More recent works using moment-based ambiguity sets include \cite{delage2010distributionally} and \cite{wiesemann2013distributionally}.
Probability metrics, including the Kantorovich or Wasserstein metric \citep{pflug2007ambiguity,eskuhn_15}, Prokhorov metric \citep{erdogan2006ambiguous}, and $\zeta$-structure metrics \citep{zhao2015}, have also been used. % to form ambiguity sets.
\cite{hanasusanto_etal_15} provide a comprehensive review of different types of ambiguity sets.
We refer the readers to this paper and references therein for more details on different types of ambiguity sets.
As mentioned above, \cite{bental2013robust} first systematically studied the $\phi$-divergence based models and their computational tractability.
\cite{jiang2012data} investigated $\phi$-divergence based ambiguous chance-constrained programs, providing an exact approach to solve them; see also \cite{yanikoglu2012}.
Specific $\phi$-divergences---including the $\chi^2$-distance \citep{klabjan2013robust}, Kullback-Leibler divergence \citep{calafiore2007ambiguous,hukullback,wang2010likelihood} and the variation distance \citep{jiang2015variation}---were also studied.
\citet{hukullback} and \citet{jiang2015variation} differ from this work and the others by considering continuous distributions.
Close to our work, \cite{bertsimas_gupta_kallus_14} study robust problems with ambiguity sets formed via goodness-of-fit test statistics. Some of their results include $\phi$-divergences, but they consider other tests as well.
Our work unites these papers with various $\phi$-divergences in the finite support case, providing insight into conditions where each $\phi$-divergence should be used.
To the best of our knowledge, this is the first paper examining the behavior of different $\phi$-divergences in an optimization setting.
%\citet{bental2010soft} presents a ``soft'' robust model that allows for changing the level of robustness across the uncertainty set.
%Although it is not a distributionally robust stochastic program, this soft robust model, like the distributionally robust models, takes the form of a convex risk measure.
The contributions of this paper, along with a motivation to study the corresponding research questions, are as follows:
\begin{itemize}
\item[(i)]
%One of the open problems identified by \citet{bental2013robust} was to study the performance of different $\phi$-divergences.
Given that there are many $\phi$-divergences, a decision maker is left with the question of how each divergence behaves for their particular problem and which one to choose.
\begin{itemize}
\item In this paper we provide a classification of $\phi$-divergences that begins to answer this open question.
Our classification is based on the types of distributions admitted into the ambiguity set.
This analysis provides insight into which class of $\phi$-divergence may be most useful to which type of data and decision maker.
\item Our main classification is a general feature of $\phi$-divergences, and it applies to a broader class of $\phi$-divergence constrained distributionally robust problems than the ones presented in this paper.
\end{itemize}
\item[(ii)] In a data-driven setting, several important questions arise.
What happens as we add one more data point?
Will our solution change, and if so, will the overall cost decrease?
Can we determine which scenarios, when sampled, result in a better (lower-cost) solution?
Can we characterize the behavior of the problem as we add more data?
We provide answers to these questions.
\begin{itemize}
\item First, we provide a simple condition to determine if sampling from a particular scenario will lower the cost, which again can be generalized beyond the \plp\ setting.
We refer to this as the {\it value of additional data}.
\item Next, in a data-driven setting---where random data is collected to form the ambiguity set of distributions using $\phi$-divergences---we show that asymptotically, \plp\ behaves essentially the same as a stochastic program with the (unknown) true distribution.
\end{itemize}
\item[(iii)] Stochastic programs often become quite large, which raises questions of computational tractability.
We devise a modified Benders decomposition that can be used to solve \plp\ efficiently by solving only linear problems.
\item[(iv)] Finally, we present examples of $\phi$-divergences that result in commonly used risk models and illustrate our classification on these models.
We also numerically illustrate our results on a small electricity generation example.
\end{itemize}
\smallskip
%\subsection{Organization}
The rest of the paper is organized as follows.
Section \ref{sec:phi_divergences} introduces $\phi$-divergences and lists several useful properties that are used throughout the paper.
Section \ref{sec:plp2} presents the derivation of $\phi$-divergence constrained ambiguous two-stage stochastic programs with recourse and discusses their basic properties. %, which will be used in later sections.
Data-driven properties of \plp\ are explored in Section \ref{sec:properties}.
Section \ref{sec:classification} presents a classification of $\phi$-divergences and illustrates different classes using risk models.
Then, Section \ref{sec:soln_algorithm} discusses a decomposition method for solving the \plp.
Section \ref{sec:comp_results} numerically illustrates the results of the paper, and finally
%some of the properties of the \plp\ model.
Section \ref{sec:plp_conclusions} concludes with a summary and future work.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{$\phi$-Divergences} %Introduction to
\label{sec:phi_divergences}
In this section we define the concept of a $\phi$-divergence and review several properties of $\phi$-divergences that will be used throughout the paper.
\citet{pardo2005statistical} provides a good overview of the known properties of $\phi$-divergences.
We refer the readers to this book for further details.
Many results in this section can also be found in \cite{bental1991certainty,bental2013robust}.
In the finite case, $\phi$-divergences are used to measure the distance between two non-negative vectors $\p = (p_1, \dots, p_n)^T$ and $\q = (q_1, \dots, q_n)^T$.
Specifically, when $\p$ and $\q$ are probability vectors (i.e., satisfying $\sum_{\omega=1}^n p_\omega = \sum_{\omega=1}^n q_\omega = 1$), $\phi$-divergences are used to quantify the distance between two discrete distributions with finite support.
The $\phi$-divergence is defined by
\[
I_\phi(\p,\q) = \sum_{\omega=1}^n q_\omega \phi\left(\frac{p_\omega}{q_\omega}\right),
\]
where $\phi(t)$, called the {\it $\phi$-divergence function}, is a convex function on $t \geq 0$ with $\phi(1) = 0$.
Additionally, it is defined that $0 \phi(a/0) = a \lim_{t \rightarrow \infty} \frac{\phi(t)}{t}$ for $a>0$ and $0 \phi(0/0) = 0$.
When both $\p$ and $\q$ are probability vectors---which is our setup throughout this paper---we can further assume that $\phi(t) \geq 0$ without loss of generality.
Observe that the function $\phi(t)$ can be modified as $\psi(t) = \phi(t) + c(t-1)$ with an appropriately chosen constant $c$ such that $\psi(t) \geq 0$ for all $t\geq 0$ and $I_\psi(\p,\q) = I_\phi(\p,\q)$ for all probability vectors $\p,\q$.
If $\phi(t)$ is differentiable at $t = 1$, this modification can be done by selecting $c = -\phi'(1)$.
%; see, e.g., the Likelihood divergence and Burg entropy in Table~\ref{tb:phi_definitions}.
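For instance, the Likelihood divergence $\phi_l(t) = -\log t$ in Table~\ref{tb:phi_definitions} has $\phi_l'(1) = -1$; choosing $c = 1$ yields $\psi(t) = -\log t + t - 1$, which is the Burg entropy and is nonnegative for all $t \geq 0$.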
Throughout the remainder of this paper, we assume $\phi(t) \geq 0$ for $t\geq 0$, although we give an example of a $\phi$-divergence that does not satisfy this condition in Table~\ref{tb:phi_definitions} below.
%We will discuss this case shortly.
We extend $\phi(t)$ to the set of reals by setting $\phi(t)=+\infty$ for $t<0$.
We make a technical---but not restrictive---assumption
%because we will use $\phi$-divergences in an optimization setting.
%We assume $\phi$ is a closed function.
that $\phi$ is a closed function because we will use $\phi$-divergences in an optimization setting.
For a proper convex function, closedness (i.e., its epigraph being closed) is the same as lower semi-continuity \citep{rockafellar_70}.
(Recall that $\phi$ is a proper function because for at least one $t$ ($t=1$), $\phi(t)=0<\infty$, and $\phi(t)>-\infty$, for all $t\in\mathbb{R}$.)
%Lower semi-continuity is a natural assumption because we will use $\phi$-divergences in an optimization setting.%, and it is satisfied by common $\phi$-divergences (see Table~\ref{tb:phi_definitions}).
%This assumption is naturally satisfied by $\phi$-divergences that are commonly used in the literature (see Table~\ref{tb:phi_definitions}).
Lower semi-continuity is a desirable property in our setting, and it is satisfied by common $\phi$-divergences (see Table~\ref{tb:phi_definitions}).
$\phi$-divergences are not, in general, metrics.
Most $\phi$-divergences do not satisfy the triangle inequality and many are not symmetric in the sense that $I_\phi(\p,\q) \neq I_\phi(\q,\p)$.
One exception is the variation distance, which is equivalent to the $L^1$-distance between the vectors (see Table~\ref{tb:phi_definitions}).
A $\phi$-divergence has an {\it adjoint}, defined by
\begin{equation} \label{eq:adjoint}
\tilde{\phi}(t) = t \phi\left(\frac{1}{t}\right),
\end{equation}
which satisfies all criteria for a $\phi$-divergence \citep{bental1991certainty} and has the property that $I_{\tilde{\phi}}(\p,\q) = I_\phi(\q,\p)$.
Divergences that are symmetric with respect to the input vectors are known as {\it self-adjoint}.
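For example, a direct computation from (\ref{eq:adjoint}) shows that the Kullback-Leibler divergence and the Burg entropy are adjoints of one another: with $\phi_{kl}(t) = t\log t - t + 1$, we have $\tilde{\phi}_{kl}(t) = t\,\phi_{kl}(1/t) = -\log t + t - 1 = \phi_b(t)$.
The variation and Hellinger distances, in contrast, are self-adjoint (see Table~\ref{tb:phi_definitions}).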
An important function related to the $\phi$-divergence function is its {\it convex conjugate}, which is used, for instance, in the dual problem formulation (Section \ref{ssec:form}).
We will also use the properties of the conjugate for our classification (Section \ref{sec:classification}).
The conjugate $\phi^* : \R \rightarrow \R \cup \{\infty\}$ is defined as
\begin{equation} \label{eq:conjugate}
\phi^*(s) = \sup_{t \geq 0} \{st - \phi(t)\}.
\end{equation}
It is a nondecreasing convex function that may take the value $+\infty$ above some upper bound $\bar{s}$.
Because $\phi$ is a proper closed convex function, $\phi^{**}=\phi$ and $t \in \partial \phi^*(s)$ if and only if $s \in \partial \phi(t)$ \citep[Corollary 23.5.1]{rockafellar_70}.
We will use the latter property in our analysis.
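As an illustration of (\ref{eq:conjugate}), consider the Kullback-Leibler divergence $\phi_{kl}(t) = t\log t - t + 1$.
The supremum of $st - \phi_{kl}(t)$ over $t \geq 0$ is attained where $s - \log t = 0$, i.e., at $t = e^s$, and substituting back gives $\phi_{kl}^*(s) = e^s - 1$, as listed in Table~\ref{tb:phi_definitions}.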
Table \ref{tb:phi_definitions} lists some common examples of $\phi$-divergences, along with their adjoints and conjugates.
The value of the conjugate is listed only in its domain, i.e., $\{s : \phi^*(s) < \infty\}$.
Most of these $\phi$-divergences are widely used in statistics and information theory.
%Using $\phi$-divergences to model ambiguous probability distributions is an attractive approach because it uses the data directly---only those data points or scenarios of interest are used in the calculations.
%%These scenarios can come from direct observation, results of simulation, or from expert opinion that the decision maker would especially like to be robust against.
%Because the \plp\ depends only on these scenarios, the size of the problem is polynomial in the sample size, making the \plp\ computationally tractable.
Because many $\phi$-divergences are commonly used in statistics---e.g., to conduct goodness-of-fit tests \citep{pardo2005statistical}---they provide natural ways to deal with data and distributions.
Consequently, models based on $\phi$-divergences can be more data driven.
For instance, many $\phi$-divergences use more distributional information than moments.
Another advantage is that they form convex ambiguity sets.
This opens up the tools of convex analysis and allows computationally tractable models.
Finally, they encompass a fairly large class of problems, including some important risk-averse optimization problems.
In Section \ref{ssec:special_phi}, we present $\phi$-divergences that assign a distance of either $0$ or $\infty$, which result in commonly used risk models.
%; we will provide some examples in Section~\ref{ssec:special_phi}.
Table \ref{tb:phi_definitions} lists a divergence, labeled ``Likelihood,'' that is somewhat different from the others.
The Likelihood divergence is equivalent to the Burg entropy when comparing probability vectors, but it does not satisfy the normalizing condition $\phi(t) \geq 0$.
This divergence is included because \citet{wang2010likelihood} use it to formulate a distributionally robust problem in which the ambiguity set of distributions has a sufficiently high empirical likelihood.
They refer to this method as the Likelihood Robust Optimization.
We also note that \cite{calafiore2007ambiguous} and \cite{hukullback} use a different naming convention than the one given here, referring to the Burg entropy as the Kullback-Leibler (KL) divergence---reversing the order of the arguments $\p$ and $\q$ relative to the notation presented here.
In this paper, $\q$ denotes the nominal distribution.
%, which is the distribution found directly via data or a distribution that is believed to represent the 'true' distribution but with some uncertainty.
\begin{table}
\TABLE
{
Definitions of some common $\phi$-divergences, along with their adjoints $\tilde{\phi}(t)$ and conjugates $\phi^*(s)$
\label{tb:phi_definitions}
}
{\begin{tabular}{lccccc}
\hline
Divergence & $\phi$ & $\tilde{\phi}$ & $\phi(t),\ t \geq 0$ & $I_\phi(\p,\q)$ & $\phi^*(s)$ \\
\hline
Kullback-Leibler & $\phi_{kl}$ & $\phi_b$ & $t\log t - t + 1$ & $\sum p_\omega \log\left(\frac{p_\omega}{q_\omega}\right)$ & $e^s - 1$ \\
Burg Entropy & $\phi_b$ & $\phi_{kl}$ & $-\log t + t - 1$ & $\sum q_\omega \log\left(\frac{q_\omega}{p_\omega}\right)$ & $-\log(1-s),\ s < 1$ \\
J-Divergence & $\phi_j$ & $\phi_j$ & $(t-1)\log t$ & $\sum (p_\omega - q_\omega) \log\left(\frac{p_\omega}{q_\omega}\right)$ & No closed form \\
Likelihood & $\phi_l$ & $t\log t $ & $-\log t$ & $\sum q_\omega \log\left(\frac{q_\omega}{p_\omega}\right)$ & $-\log(-s) - 1,\ s < 0$ \\
$\chi^2$-Distance & $\phi_{\chi^2}$ & $\phi_{m\chi^2}$ & $\frac{1}{t} (t-1)^2$ & $\sum \frac{(p_\omega-q_\omega)^2}{p_\omega}$ & $2 - 2\sqrt{1-s},\ s \leq 1$ \\
Modified $\chi^2$-Dist. & $\phi_{m\chi^2}$ & $\phi_{\chi^2}$ & $(t-1)^2$ & $\sum \frac{(p_\omega - q_\omega)^2}{q_\omega}$ & $\begin{cases} -1 & s < -2 \\ s + \frac{s^2}{4} & s \geq -2 \end{cases}$ \\
% $\chi$-div, $\theta > 1$ & $\phi_\chi^\theta$ & $t^{1-\theta}\phi_\chi^\theta$ & $|t-1|^\theta$ & $\sum q_\omega |1-\frac{p_\omega}{q_\omega}|^\theta$ & $\begin{cases} -1 & s \leq -\theta \\ s + (\theta-1)\left(\frac{|s|}{\theta}\right)^\frac{\theta}{\theta-1} & s \geq -\theta \end{cases}$ \\
Variation Distance & $\phi_v$ & $\phi_v$ & $|t-1|$ & $\sum |p_\omega - q_\omega|$ & $\begin{cases} -1 & s \leq -1 \\ s & -1 \leq s \leq 1 \end{cases}$ \\
Hellinger Distance & $\phi_h$ & $\phi_h$ & $(\sqrt{t} - 1)^2$ & $\sum (\sqrt{p_\omega} - \sqrt{q_\omega})^2$ & $\frac{s}{1-s},\ s < 1$ \\
\hline
\end{tabular}}
{}
\end{table}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{$\phi$-Divergence Constrained Ambiguous Stochastic Program}
\label{sec:plp2}
%In this section we provide primal and dual formulations and basic properties of two-stage ambiguous stochastic linear programs constructed via $\phi$-divergences.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Formulation}
\label{ssec:form}
We begin with a two-stage stochastic linear program with recourse (SLP-2).
%, which we will soon present its corresponding \plp.
Let $\x$ be a vector of first-stage decision variables with cost vector $\c$, constraint matrix $\A$, and right-hand-side vector $\b$.
We assume the random parameters of the second-stage problem have a finite distribution with realizations indexed by $\omega = 1, \dots, n$ and probabilities denoted by $q_\omega$, $\omega = 1, \dots, n$.
%and probabilities denoted by...
We refer to realizations interchangeably as scenarios.
The SLP-2 is given by
\begin{equation}
\min_\x \ \left\{ \c\x + \sum_{\omega=1}^n q_\omega h_\omega(\x) : \ \A\x = \b, \x \geq 0 \right\}, \label{eq:slp_first_stage}
\end{equation}
where
\begin{align}
h_\omega(\x) = \min_{\y^\omega} \ & \left\{ \g^\omega \y^\omega : \ \D^\omega \y^\omega = \B^\omega \x + \d^\omega, \y^\omega \geq 0 \right\}, \ \ \omega = 1, \dots, n. \label{eq:slp_second_stage}
\end{align}
For a given scenario $\omega$, the second-stage decision variables $\y^\omega$ are optimized with respect to cost vector $\g^{\omega}$.
The second-stage constraints, with recourse matrix $\D^{\omega}$, depend on the first-stage variables $\x$ through the technology matrix $\B^{\omega}$, which appears on the right-hand side of the constraints along with $\d^{\omega}$.
Throughout the rest of the paper, we denote the first-stage feasible region as $X = \{\x : \ \A\x = \b, \x \geq 0\}$.
The first stage of a prototypical SLP-2 allocates capacities to different electricity generators before demand and reliability of the generators are known.
Then, once the electricity demand and generator reliabilities become known, the second stage provides electricity to demand sites at the least cost.
We will consider this problem for our numerical results in Section~\ref{sec:comp_results}.
%We assume relatively complete recourse; i.e., the second-stage problems (\ref{eq:slp_second_stage}) are feasible for every feasible solution $\x$ of the first-stage problem (\ref{eq:slp_first_stage}).
%We also assume that the second-stage problems are dual feasible for every feasible solution $\x$ of the first-stage problem.
The SLP-2 formulation assumes that $\q$ is known.
However, in many applications, the distribution is itself unknown, and there is no reliable way to obtain the probabilities of scenarios $\omega$.
By replacing the specific distribution in SLP-2 with a set of distributions sufficiently close to the nominal distribution $\q$ with respect to a $\phi$-divergence, we create the \plp\ model.
In the \plp, the objective function is minimized with respect to the worst-case distribution selected from the ambiguity set of distributions.
The resulting minimax formulation of \plp\ is
\begin{equation}
\min_{\x \in X} \max_{\p \in \mathcal{P}} \left\{ \c\x + \sum_{\omega=1}^{n} p_\omega h_\omega(\x) \right\}, \label{eq:plp_primal}
\end{equation}
where the ambiguity set is
\begin{align}
\mathcal{P} = & \Bigg\{ \p : \ \sum_{\omega = 1}^{n} q_\omega \phi\left(\frac{p_\omega}{q_\omega}\right) \leq \rho, \label{eq:plp_primal_divergence} \\
& \ \sum_{\omega=1}^{n} p_\omega = 1, \label{eq:plp_primal_probability} \\
& \ p_\omega \geq 0,\ \forall \omega \Bigg\}. \label{eq:nonneg}
\end{align}
We refer to (\ref{eq:plp_primal_divergence}) as the $\phi$-divergence constraint, while (\ref{eq:plp_primal_probability}) and (\ref{eq:nonneg}) simply ensure that $\p$ is a probability vector.
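For instance, for the variation distance $\phi_v(t) = |t-1|$ (see Table~\ref{tb:phi_definitions}), the ambiguity set is simply the intersection of the probability simplex with an $L^1$-ball of radius $\rho$ centered at the nominal distribution,
\[
\mathcal{P} = \left\{ \p : \ \sum_{\omega=1}^{n} |p_\omega - q_\omega| \leq \rho, \ \sum_{\omega=1}^{n} p_\omega = 1, \ p_\omega \geq 0, \ \forall \omega \right\}.
\]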
We will discuss how to determine $\rho$ in (\ref{eq:plp_primal_divergence}) shortly in Section \ref{ssec:robust_level}.
Taking the Lagrangian dual of the inner maximization problem, with dual variables $\lambda$ and $\mu$ associated with constraints (\ref{eq:plp_primal_divergence}) and (\ref{eq:plp_primal_probability}), respectively, and then combining the two minimizations gives \plp\ in dual form
\begin{align}
\min_{\x,\lambda,\mu} \ & \c\x + \mu + \rho \lambda + \lambda \sum_{\omega=1}^{n} q_\omega \phi^*\left(\frac{h_\omega(\x) - \mu}{\lambda}\right) \label{eq:plp_two_stage} \\
\st \ & \x \in X \nonumber \\
& h_\omega(\x) - \mu \leq \left( \lim_{t \rightarrow \infty} \frac{\phi(t)}{t} \right) \lambda, \ \forall \omega \label{eq:plp_feas_constraint}\\
& \lambda \geq 0. \nonumber
\end{align}
When $\lambda =0$, the last term in the objective function (\ref{eq:plp_two_stage}) has the following interpretations: $0\phi^*(b/0)=0$ if $b\leq 0$ and $0\phi^*(b/0)=+\infty$ if $b > 0$.
%where $h_\omega(\x)$ and the second-stage problems are as given in (\ref{eq:slp_second_stage}).
When $\rho>0$, $\q$ strictly satisfies the $\phi$-divergence constraint $I_{\phi}(\q,\q)=0<\rho$.
So, the Slater condition holds, and we have strong duality.
Some $\phi$-divergences, like the J-divergence, have no closed-form representation of $\phi^*$.
However, they can be expressed as the sum of other $\phi$-divergences with closed-form conjugates.
For example, the sum of the Burg entropy and the KL divergence gives the J-divergence.
In this case, the dual can be formed similarly; see \cite{bental2013robust} for details.
Theorem 1 of \cite{bental2013robust} contains a derivation of the dual problem, which is reprinted as part of the proof of Proposition \ref{prop:pop}.
The right-hand side of (\ref{eq:plp_feas_constraint}) contains a limit.
This constraint results from a dual feasibility consideration.
When this limit is finite, i.e., $\lim_{t \rightarrow \infty} \frac{\phi(t)}{t}= \bar{s}<\infty$, then for any $s> \bar{s}$, $\phi^*(s)=\infty$.
Therefore, $\phi$-divergences with a finite limit (like the variation distance) induce this constraint.
In (\ref{eq:plp_feas_constraint}), we moved $\lambda$ to the right-hand side to allow for $\lambda=0$.
On the other hand, when this limit is $\infty$ (like the KL divergence), $\phi^*(s)<\infty$ for any finite value of $s$.
In this case,
%all values are dual feasible, and
constraint (\ref{eq:plp_feas_constraint}) can be removed from the formulation.
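As a concrete illustration, for the Kullback-Leibler divergence we have $\phi^*(s) = e^s - 1$ and $\lim_{t \rightarrow \infty} \phi_{kl}(t)/t = \infty$, so constraint (\ref{eq:plp_feas_constraint}) drops out and, using $\sum_{\omega=1}^{n} q_\omega = 1$, the dual (\ref{eq:plp_two_stage}) becomes
\[
\min_{\x \in X, \ \lambda \geq 0, \ \mu} \ \c\x + \mu + (\rho - 1)\lambda + \lambda \sum_{\omega=1}^{n} q_\omega \exp\left(\frac{h_\omega(\x) - \mu}{\lambda}\right).
\]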
Observe, in particular, that the dual formulation remains valid even when $q_\omega = 0$ for some $\omega$.
%(see proof of Proposition \ref{prop:pop} and Section~\ref{sec:classification} for more details).
We will discuss this case in more detail in Section~\ref{sec:classification} (e.g., proof of Proposition \ref{prop:pop}).
%For some $\phi$-divergences, dant.
%However, other $\phi$-divergences, like the variation distance, have a finite limit, inducing this constraint.
%When this limit is $\infty$, on the other hand, $\phi^*(s)<\infty$ for any finite value of $s$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Basic Properties of \plp}
\label{ssec:basicprop}
In this section we list some basic properties of \plp.
Some of these properties have already been noted earlier (e.g., by \citet{bental2013robust,bental_teboulle_07} and by others for specific $\phi$-divergences), but we list them for completeness.
These properties help with our specialized solution method and our classification of $\phi$-divergences, and we refer to them in later sections.
Throughout the rest of the paper, we use the notation
\begin{equation}
s_\omega = \frac{h_\omega(\x) - \mu}{\lambda}. \label{eq:s_omega_definition}
\end{equation}
Furthermore, $(\x^*, \p^*)$ denotes the optimal primal solution, and $(\x^*, \lambda^*, \mu^*)$ denotes the optimal dual solution.
Optimal $s_\omega^*$ can then be found by plugging in the respective optimal solutions in (\ref{eq:s_omega_definition}).
We assume that $X$ is nonempty and compact and that the second-stage problems (\ref{eq:slp_second_stage}) are primal and dual feasible for all $\omega$ and all $\x \in X$.
This ensures that each $h_\omega(\x)$ is finite and that both SLP-2 and \plp\ have finite optimal values.
The basic properties of \plp\ are as follows.
\begin{property}
\label{property:convex}
\plp\ is a convex program.
\end{property}
\begin{property}
\label{property:coherent_risk_measure}
\plp\ is equivalent to minimizing a coherent risk measure.
\end{property}
\begin{property}
\label{property:time_structure}
\plp\ preserves the time structure of SLP-2.
\end{property}
\begin{property} {\sc (Primal-Dual Relation.)}
\label{property:primal_dual_relation}
The (optimal) worst-case probabilities $p^*_\omega$ can be calculated with the equations
% \begin{align}
\begin{equation}\label{eq:p_worst}
\frac{p_\omega^*}{q_\omega} \in \partial \phi^*\left(s_\omega^*\right), \ \ \ \ \ \sum_{\omega=1}^n q_\omega \phi\left(\frac{p_\omega^*}{q_\omega}\right) = \rho, \ \ \ \ \ \sum_{\omega=1}^n p_\omega^* = 1
\end{equation}
when $\lambda^*>0$ and $q_\omega >0$.
With $\lambda^*>0$ and $q_\omega =0$, $p_\omega^* \in \partial \phi^*\left(s_\omega^*\right) q_\omega$ (i.e., $p_\omega^* = 0$) when $s_\omega^* < \bar{s}$; otherwise (i.e., when $s_\omega^* = \bar{s}<\infty$) use the last two equations in (\ref{eq:p_worst}).
With $\lambda^*=0$, set $p_\omega^* = 0$ when $h_\omega(\x^*) - \mu^* < 0$; otherwise (when $h_\omega(\x^*) - \mu^* = 0$), use the last two equations in (\ref{eq:p_worst}), but with the regular $\phi$-divergence constraint $I_{\phi}(\p,\q)\leq \rho$ instead of $I_{\phi}(\p,\q)=\rho$.
%
%$\phi$-divergence constraint (\ref{eq:plp_primal_divergence}) and $\sum_\omega p^*_\omega = 1$.
\end{property}
A coherent risk measure---first proposed by \cite{Artzner_et_al_1999}, and later refined by several authors including \cite{rockafellar2007coherent,shaDR:09}---has desirable properties, including convexity and monotonicity.
These properties, combined with the facts that $h_\omega(\x)$ is convex over $X$ and that $X$ is a convex set, imply Property \ref{property:convex}.
See also Proposition \ref{prop:convex} in Section \ref{sec:soln_algorithm} and \cite{bental2013robust}.
Observe that Property \ref{property:coherent_risk_measure} is valid even when we have $q_\omega=0$ for some $\omega$.
The case of $q_\omega = 0$ plays an important role in the classification presented in Section \ref{sec:classification}.
Therefore, we provide a proof of Property \ref{property:coherent_risk_measure} in this case in Appendix \ref{sec:apx_proof}.
%In Section \ref{ssec:special_phi}, we present $\phi$-divergences that result in commonly used coherent risk measures.
Property \ref{property:time_structure} helps with our decomposition-based solution method described in Section \ref{sec:soln_algorithm}.
The preservation of time structure can be seen in (\ref{eq:plp_two_stage}).
Rewriting it slightly, we obtain
\begin{equation}
\label{eq:dec}
\min_{\x \in X,\lambda \geq 0,\mu} \left\{ \c\x + \mu + \rho \lambda + \sum_{\omega=1}^{n} q_\omega h_\omega^{\dagger}(\x, \lambda, \mu) \colon \ s_\omega \leq \bar{s} \right\}.
\end{equation}
The above formulation preserves the two-stage structure of the SLP-2.
The first-stage variables can now be viewed as $\x, \lambda$, and $\mu$.
The expectation is taken using the nominal probability vector $\q$.
Finally, $ h^{\dagger}_\omega(\x, \lambda, \mu) = \lambda \phi^*\left(\frac{h_\omega(\x) - \mu}{\lambda} \right)$,
%or equivalently $\lambda\phi^*(s_\omega)$,
where $h_\omega(\x)$ are defined as before.
%In Section \ref{sec:soln_algorithm}, we decompose the problem by scenario and convert (sub-)gradients of $h_\omega(\x)$ to (sub-)derivatives of $\phi^*\left(s_\omega\right)$.
Property \ref{property:primal_dual_relation} lists the first-order necessary conditions for optimality.
The appearance of the conjugate $\phi^*$ in the objective of (\ref{eq:plp_two_stage}) gives a method for retrieving the worst-case distribution from the dual problem.
It uses the fact that $\frac{p^*_\omega}{q_\omega} \in \partial \phi^*(s^*_\omega)$ if and only if $s^*_\omega \in \partial \phi\left(\frac{p^*_\omega}{q_\omega}\right)$.
In many cases, the first equation in (\ref{eq:p_worst}) is sufficient to calculate $p_\omega^*$.
In addition, $\phi^*$ is often differentiable, in which case we have the relationship $p_\omega^* = q_\omega \phi^{* \prime}(s_\omega^*)$.
Observe that because $\phi$ is a proper closed convex function, so is $\phi^*$ \citep[Theorem 12.2]{rockafellar_70}.
Hence, $\phi^*$ is subdifferentiable on the relative interior of its domain \citep[Theorem 23.4]{rockafellar_70}.
The boundary of the domain of $\phi^*$---that is, at $s=\bar{s}$ when $\bar{s}<\infty$---might require special care, where we need the primal feasibility conditions (\ref{eq:plp_primal_divergence}) and (\ref{eq:plp_primal_probability}).
The $\phi$-divergence constraint in (\ref{eq:p_worst}) is written as an equality because with $\lambda^*>0$, the complementary slackness dictates that this constraint must be active.
With $\lambda^*=0$, we have the regular $\phi$-divergence constraint (\ref{eq:plp_primal_divergence}).
While Property \ref{property:primal_dual_relation} summarizes how to obtain $p^*_\omega$ for the cases when $\lambda^* = 0$ or $q_\omega = 0$, we will discuss these special cases in more detail in Section \ref{sec:classification}.
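As an illustration of Property \ref{property:primal_dual_relation}, consider the Kullback-Leibler divergence with $\lambda^* > 0$ and $q_\omega > 0$ for all $\omega$.
Here $\phi^*(s) = e^s - 1$ is differentiable, so the first condition in (\ref{eq:p_worst}) gives $p_\omega^* = q_\omega e^{s_\omega^*}$; that is, the worst-case distribution is an exponential tilting of the nominal distribution that shifts weight toward scenarios with larger second-stage costs $h_\omega(\x^*)$, while $\lambda^*$ and $\mu^*$ are determined by the remaining two conditions in (\ref{eq:p_worst}).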
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{The Level of Robustness}
\label{ssec:robust_level}
The literature on $\phi$-divergences provides some insight on choosing a reasonable asymptotic value of $\rho$ in the data-driven setting.
In this setting, $\q$ is generated from observations, where scenario $\omega$ has been observed $N_\omega$ times with $N = \sum_{\omega=1}^n N_\omega$ total observations.
So, the nominal probability of scenario $\omega$ is set to be $q_\omega = \frac{N_\omega}{N}$.
When $\phi$ is twice continuously differentiable around $1$ with $\phi^{\prime \prime}(1)>0$, \citet[Theorem 3.1]{pardo2005statistical} shows that the statistic
\[
T^\phi_N(\q^N,\qtrue) = \frac{2N}{\phi''(1)} I_{\phi}(\q^N, \qtrue)
%\sum_{\omega=1}^n \qtrue_\omega \phi\left(\frac{q^N_\omega}{\qtrue_\omega}\right)
\]
converges in distribution to a $\chi^2$-distribution with $n-1$ degrees of freedom, where $\q^N$ denotes the empirical distribution ($q^N_\omega = N_\omega/N$) and $\qtrue$ denotes the underlying true distribution.
Most $\phi$-divergences in Table~\ref{tb:phi_definitions} satisfy this differentiability condition.
\citet{bental2013robust} then use this result to suggest the asymptotic value
\begin{equation} \label{eq:asymptotic_rho}
\rho = \frac{\phi''(1)}{2N} \chi^2_{n-1,1-\alpha},
\end{equation}
where $\chi^2_{n-1,1-\alpha}$ is the $1-\alpha$ percentile of a $\chi^2_{n-1}$ distribution.
This choice of $\rho$ produces an approximate $1-\alpha$ confidence region on the true distribution.
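As an illustrative calculation, suppose there are $n = 5$ scenarios, $N = 100$ observations, and $\alpha = 0.05$, so that $\chi^2_{4,0.95} \approx 9.49$.
Then (\ref{eq:asymptotic_rho}) gives $\rho \approx \frac{2}{200}(9.49) \approx 0.095$ for the modified $\chi^2$-distance ($\phi''(1) = 2$) and $\rho \approx 0.047$ for the Kullback-Leibler divergence ($\phi''(1) = 1$); in either case, $\rho$ shrinks proportionally to $1/N$ as more observations are collected.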
To correct for small sample sizes and for more details, we refer the readers to \cite{pardo2005statistical} and \cite{bental2013robust}.
%We are now ready to present the main contributions of this paper.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Data-Driven Properties}
\label{sec:properties}
In this section we assume the nominal probabilities $\q$ are the empirical probabilities $\q^{N}$---i.e., $q_\omega=q_\omega^N = {N_\omega}/{N}$---and provide insight into how the \plp\ changes as data is added.
First, we investigate how \plp\ might change with a single additional observation in Section \ref{ssec:value}.
Next, we examine what happens as more and more data is gathered with asymptotic results in Section \ref{ssec:epiconvergence}.
This analysis must consider how the level of robustness $\rho$ changes as additional observations are obtained.
Therefore, in this section, we use the notation $\rho_N$ to emphasize the dependence of $\rho$ on the number of observations.
To be consistent with the known $\phi$-divergence results stated in Section \ref{ssec:robust_level}, we set $\rho_N = \frac{\rho_0}{N}$.
Observe that $\rho_0=\frac{\phi''(1)}{2} \chi^2_{n-1,1-\alpha}$ in (\ref{eq:asymptotic_rho}).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Value of Additional Data} \label{ssec:value}
With a data-driven formulation such as \plp,
%it is natural to ask how the optimal value and solution changes as more data is gathered.
%In particular, %for robust formulations like \plp\
one might be concerned about being overly conservative in the problem formulation and thus missing the opportunity to find a better solution with respect to the true distribution.
For \plp, this means that the initial model is likely to be more conservative in an effort to be robust, while the new information could make the model less conservative.
This would happen, for instance, when the new information removes the current worst-case distribution from the ambiguity set.
Below, we present a simple method of determining if taking an additional observation will allow for a lower-cost solution.
%Below, we present a simple method of determining if taking an additional sample will eliminate the old worst-case distribution and allow for a lower-cost solution.
%We also provide a way of estimating the probability of sampling such an observation.
%Our main goal is to come up with simple conditions on how an additional data affects the problem by using the current solution.
Our main goal is to derive simple conditions using only the current solution.
In particular, we would like to use only the current optimal worst-case probabilities $p_\omega^*$, nominal probabilities $q_\omega$, and the number of observations $N$.
One could, of course, solve the problem with an additional observation of $\omega$, see if the optimal value is lowered, and check this for every scenario $\omega$.
We would like to avoid re-solving the problem.
Toward this end, we first provide a general result.
Using this general result, we then provide simpler conditions for a subset of the $\phi$-divergences by using only $\q$, $\p^*$ and $N$ in Corollary \ref{cor:cost_decrease_trick}.
\begin{proposition}
\label{prop:value}
Let $(\x^*_N,\mu^*_N,\lambda^*_N)$ solve the $N$-observation (dual) problem with $q_\omega = \tfrac{N_\omega}{N}$.
Suppose $s^*_\omega = \dfrac{h_\omega(\x^*_N) - \mu^*_N}{\lambda^*_N}$ is finite and $\phi^*$ is subdifferentiable at $s^*_\omega $ for all $\omega=1,\ldots,n$.
An additional observation of scenario $\hat{\omega}$ will result in a decrease in the worst-case expected cost of \plp\ if the following condition is satisfied
\begin{equation} \label{eq:cost_decrease_cond}
\sum_{\omega=1}^n q_\omega \phi^{*\prime}\left(\frac{N}{N+1}s^*_\omega\right) \left(\frac{N}{N+1}s^*_\omega\right) > \phi^*\left(\frac{N}{N+1}s^*_{\hat{\omega}}\right).
\end{equation}
In (\ref{eq:cost_decrease_cond}), $\phi^{*\prime}(s)$ denotes the derivative of $\phi^{*}$ at $s$ if it is differentiable, and it denotes a subgradient of $\phi^{*}$ at $s$ otherwise.
\end{proposition}
\begin{proof}{\sc Proof of Proposition \ref{prop:value}.}
For ease of exposition, assume $\phi^*$ is differentiable.
% The proof works without this assumption by considering a subgradient.
We begin the proof with the change of variables $\kappa = \frac{\lambda}{N}$ and note that $N\rho_N = (N+1)\rho_{N+1} =\rho_0$ is constant.
With this change of variables, the objective function of the $N$-observation problem is given by
\[
f_N(\x,\mu,\kappa) = c\x + \mu + \rho_0 \kappa + \sum_{\omega = 1}^n N_\omega \left[ \kappa \phi^*\left(\frac{h_\omega(\x) - \mu}{\kappa N} \right) \right].
\]
Let $z_N^*$ be the optimal value and $(\x^*_N,\mu^*_N,\kappa^*_N)$ be the optimal solution of the $N$-observation dual problem (\ref{eq:plp_two_stage}) with the change of variables.
% and objective function $f_N(\x,\mu,\kappa)$.
%$ = \min_{\x \in X,\mu,\kappa \geq 0} f_N(\x,\mu,\kappa)$.
We wish to find a simple estimate of the decrease in the optimal cost $z_N^* - z_{N+1}^*$ associated with taking an additional observation of a specific scenario $\hat{\omega}$.
In particular, we look for a condition under which $z_N^* - z_{N+1}^* > 0$.
% Let $(\x^*_N,\mu^*_N,\kappa^*_N)$ minimize $f_N$.
Notice that $(\x^*_N,\mu^*_N,\kappa^*_N)$ is a feasible but not necessarily an optimal solution to the $(N+1)$-observation problem.
%with an additional observation of scenario $\hat{\omega}$.
Then, $z_N^* - f_{N+1}(\x^*_N,\mu^*_N,\kappa^*_N)$ provides a lower bound on the decrease in optimal cost $z_N^* - z_{N+1}^*$.
We will find scenarios $\hat{\omega}$ such that $z_N^* - f_{N+1}(\x^*_N,\mu^*_N,\kappa^*_N) > 0$.
The objective of the $(N+1)$-observation problem for a given $(\x,\mu,\kappa)$ is $ f_{N+1}(\x,\mu,\kappa) = \c\x + \mu + \rho_0 \kappa + \sum_{\omega = 1}^n N'_\omega \left[ \kappa \phi^*\left(\frac{h_\omega(\x) - \mu}{\kappa (N+1)} \right) \right]$, where $N'_\omega$ is the number of observations of scenario $\omega$ after $N+1$ total observations (i.e., $N'_{\hat{\omega}}=N_{\hat{\omega}}+1$ and $N'_\omega = N_\omega$ for $\omega \neq \hat{\omega}$).
Then, $z_N^* - z_{N+1}^*$ is bounded below by $\kappa \sum_{\omega=1}^n \left[ N_\omega \phi^*\left(\frac{h_\omega(\x) - \mu}{\kappa N} \right) - N'_\omega \phi^*\left(\frac{h_\omega(\x) - \mu}{\kappa (N+1)} \right) \right]$ evaluated at $(\x^*_N,\mu^*_N,\kappa^*_N)$, and this bound must be positive to guarantee a drop in the optimal cost.
If $\hat{\omega}$ is the next scenario observed, we can rewrite this condition as
\begin{equation} \label{eq:raw_cond}
\kappa \sum_{\omega=1}^n N_\omega \left[ \phi^*\left(\frac{h_\omega(\x) - \mu}{N\kappa} \right) - \phi^*\left(\frac{h_\omega(\x) - \mu}{(N+1)\kappa} \right) \right] - \kappa \phi^*\left(\frac{h_{\hat{\omega}}(x) - \mu}{(N+1)\kappa}\right) > 0.
\end{equation}
Let $s^N_\omega = \frac{h_\omega(\x) - \mu}{\kappa N}$ and $s^{N+1}_\omega = \frac{h_\omega(\x) - \mu}{\kappa (N+1)}$, and note that $s^{N+1}_\omega = \tfrac{N}{N+1} s^N_\omega$.
% Let $|\Delta s| = | s^{N+1}_\omega - s^N_\omega | = \frac{|s^{N+1}_\omega|}{N}$.
The difference $\phi^*(s^N_\omega) - \phi^*(s^{N+1}_\omega)$ will be bounded from below via the gradient inequality.
% Note that $\phi^*(s)$ is nondecreasing because $0 \leq t = \phi^{*\prime}(s)$ for some $t$.
% First, for $s^N_\omega > 0$, $\phi^*(s^N_\omega) - \phi^*(s^{N+1}_\omega) \geq \phi^{*\prime}(s^{N+1}_\omega) |\Delta s|$.
% Then for $s^N_\omega < 0$, $\phi^*(s^{N+1}_\omega) - \phi^*(s^N_\omega) \leq \phi^{*\prime}(s^{N+1}_\omega) |\Delta s|$ and thus $\phi^*(s^N_\omega) - \phi^*(s^{N+1}_\omega) \geq -\phi^{*\prime}(s^{N+1}_\omega) |\Delta s|$.
% Then both cases reduce to $\phi^*(s^N_\omega) - \phi^*(s^{N+1}_\omega) \geq \frac{1}{N} \phi^{*\prime}(s^{N+1}_\omega) s^{N+1}_\omega$.
Because $\phi^*(s)$ is convex, the gradient inequality gives $\phi^*(s^N_\omega) - \phi^*(s^{N+1}_\omega) \geq \frac{1}{N} \phi^{*\prime}(s^{N+1}_\omega) s^{N+1}_\omega$.
Using this inequality, we can guarantee (\ref{eq:raw_cond}) is satisfied with the condition $\kappa \sum_{\omega=1}^n \frac{N_\omega}{N} \phi^{*\prime}(s^{N+1}_\omega) s^{N+1}_\omega - \kappa \phi^*\left(\frac{h_{\hat{\omega}}(\x) - \mu}{(N+1)\kappa}\right) > 0$.
Rearranging and dividing by $\kappa > 0$ gives
\begin{equation*} %\label{eq:main_value_derivation}
\sum_{\omega=1}^n \frac{N_\omega}{N} \phi^{*\prime}(s^{N+1}_\omega) s^{N+1}_\omega > \phi^*(s^{N+1}_{\hat{\omega}}).
\end{equation*}
Finally, we return to the original variables with the substitution $s^{N+1}_\omega = \frac{N}{N+1} s^*_\omega, \forall \omega$.
\Halmos
\end{proof}
%\end{proposition}
We can interpret \eqref{eq:cost_decrease_cond} as follows.
If an additional observation is taken from the unknown distribution and the resulting observed scenario $\hat{\omega}$ satisfies (\ref{eq:cost_decrease_cond}) with the current solution, then the $(N+1)$-observation problem will have a lower cost than the $N$-observation problem that was already solved.
%This is equivalent to saying that an additional observation of $\hat{\omega}$ will rule out the computed worst-case distribution given by $\{p_\omega\}_{\omega=1}^{n}$ in \eqref{eq:p_worst}.
Observe that the statement of Proposition \ref{prop:value} eliminates the case $\lambda_N^* = 0$.
A closer look at (\ref{eq:raw_cond}) and the version of (\ref{eq:raw_cond}) after the use of the gradient inequality, and recalling that $\kappa = \lambda /N$, reveals that the condition (\ref{eq:cost_decrease_cond}) is not satisfied at $\lambda_N^* = 0$.
%\bigskip
% Hence, $\phi^*$ is subdifferentiable on the relative interior of its domain. %%%\citep[Theorem xxx]{rockafellar_70}.
% The assumption that $s^*_\omega$ is finite together with constraint (\ref{eq:plp_feas_constraint}) ensures that $\frac{N}{N+1}s^*_\omega$ is in the relative interior of the domain of $\phi^{*}$; so a subgradient exists.
%\bigskip
It is possible to simplify condition \eqref{eq:cost_decrease_cond} for some $\phi$-divergences, and we detail this in the corollary below.
Condition \eqref{eq:cost_decrease_cond} uses the dual formulation and the (sub)gradient $\phi^{* \prime}$.
This allows us to use the primal-dual relationship $\left(p_{\hat{\omega}}^* = \phi^{* \prime}(s_{\hat{\omega}}^*) q_{\hat{\omega}}\right)$ to provide simplified conditions in terms of $\q$, $\p^*$, and $N$.
\begin{corollary}
\label{cor:cost_decrease_trick}
Let $\p^*=(p_1^*, p_2^*,\ldots, p_n^*)$ solve the $N$-observation (primal) problem with $q_\omega = \tfrac{N_\omega}{N}$.
An additional observation of scenario $\hat{\omega}$ will result in a decrease in the worst-case expected cost of \plp\ if the condition corresponding to the chosen $\phi$-divergence below is satisfied:\vspace*{-0.1in}
\begin{multicols}{2}
\begin{description}
\item[Burg entropy:] $\frac{p_{\hat{\omega}}^*}{q_{\hat{\omega}}} < \frac{N}{N+1}$, %(or, $p_{\hat{\omega}}<\frac{N_{\hat{\omega}}}{N}$)
\item[$\chi^2$-distance:] $\sum_\omega \frac{(q_\omega)^2}{p_\omega^*} + \sqrt{\frac{N+1}{N}} < 2 \frac{q_{\hat{\omega}}}{p_{\hat{\omega}}^*}$,
\item[Hellinger:] $\sum_\omega q_\omega \sqrt{\frac{p_\omega^*}{q_\omega}} + \sqrt{\frac{p_{\hat{\omega}}^*}{q_{\hat{\omega}}}} < 2 \frac{N}{N+1}$,
\item[Modified $\chi^2$:] $2 \sum_\omega \frac{(p_\omega^*)^2}{q_\omega} > \left(\frac{p_{\hat{\omega}}^*}{q_{\hat{\omega}}}\right)^2 + \left(\frac{N+1}{N}\right)^2$.
\end{description}
\end{multicols}
\end{corollary}
\begin{proof}{\sc Proof of Corollary \ref{cor:cost_decrease_trick}.}
% We use a small trick to transform condition (\ref{eq:cost_decrease_cond}) into a form that is easier to work with.
%Recall that
For any real number $c$, we can define $\phi_c(t) = \phi(t) + c(t-1)$, which satisfies $I_{\phi_c}(\p,\q) = I_\phi(\p,\q)$ for probability vectors $\p$ and $\q$.
This changes the conjugate as $\phi_c^*(s) = \phi^*(s-c) + c$.
For some $\phi$, we can choose $c$ such that $\phi_c^{*\prime}(s)$ is separable, i.e., $\phi_c^{*\prime}(as) = f(a) \phi_c^{*\prime}(s)$ for some function $f$.
Using this separability, we can simplify (\ref{eq:cost_decrease_cond}) for some $\phi$ by choosing:\vspace*{-0.15in}
\begin{multicols}{2}
\begin{description}
\item[Burg entropy:] $c = -1$, so $\phi^{*\prime}_{b,c}(s) = -\frac{1}{s}$, $s<0$,
\item[$\chi^2$-distance:] $c = -1$, so $\phi^{*\prime}_{\chi^2, c}(s) = \frac{1}{\sqrt{-s}}$, $s<0$,
\item[Hellinger:] $c = -1$ so $\phi^{*\prime}_{h,c}(s) = \frac{1}{s^2}$, $s<0$,
\item[Modified $\chi^2$:] $c = 2$, so
\[
\phi^{*\prime}_{m\chi^2, c}(s) = \
\begin{cases}
0 & s < 0 \\
\frac{1}{2} s & s \geq 0.
\end{cases}
\]
\end{description}
\end{multicols}
\noindent We illustrate the rest of the steps using Burg entropy.
Because $I_{\phi_c}(\p,\q) = I_\phi(\p,\q)$, we can equivalently solve \plp\ by using $\phi_c$ instead.
Applying (\ref{eq:cost_decrease_cond}) to $\phi_{b,c}$, after some algebra, we obtain the simplified condition $- \left( \frac{N}{N+1} \right)s_{\hat{\omega}}^* >1$ for Burg entropy.
Property \ref{property:primal_dual_relation} yields the relationship $\frac{p^*_{\hat{\omega}}}{q_{\hat{\omega}}}=\frac{-1}{s_{\hat{\omega}}^*}$.
Substituting this into the simplified condition gives the desired result.
The modified $\chi^{2}$ case must consider the split at $s=0$, which is easily handled.
The left-hand side of (\ref{eq:cost_decrease_cond}) contains terms of the form $\phi^{*\prime}_{m\chi^2, c}(s_{\omega}^*)s_{\omega}^*$.
Both this term and $2\left(\frac{p_{\omega}^*}{q_\omega}\right)^2$ equal $0$ if $s_{\omega}^* < 0$ and $(s_{\omega}^*)^2/2$ otherwise, so the two coincide.
The right-hand side can be handled similarly, resulting in the final condition presented above.
\Halmos
\end{proof}
Let us take a closer look into Corollary~\ref{cor:cost_decrease_trick}'s condition for Burg entropy.
Recall that with Burg entropy, we obtain the likelihood robust optimization model of \cite{wang2010likelihood}.
Furthermore, because the roles of $\p$ and $\q$ are interchanged, some authors refer to it as the KL divergence.
In this case, we have the condition $p_{\hat{\omega}}^* < \frac{N}{N+1}q_{\hat{\omega}}$.
This means that the \plp\ has assigned a worst-case probability $p_{\hat{\omega}}^*$ that is less than the slightly adjusted observed frequency $q_{\hat{\omega}}=\frac{N_{\hat{\omega}}}{N}$ of scenario $\hat{\omega}$.
The \plp\ focuses on the worst-case cost within the ambiguity set.
Therefore, it tends to assign higher probabilities $p_\omega^*$ to costly
scenarios.
Because $p_{\hat{\omega}}^* < \frac{N}{N+1}q_{\hat{\omega}}$, the condition in Corollary~\ref{cor:cost_decrease_trick} suggests that $\hat{\omega}$ might not be a very costly scenario.
Observing this scenario one more time makes it more likely under the nominal (and, we believe, the true) distribution.
Consequently, a lower-cost scenario being more likely would decrease the optimal cost.
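As a minimal illustration, the sketch below (in Python, with hypothetical $\p^*$, $\q$, and $N$) flags the scenarios that satisfy the Burg-entropy condition of Corollary~\ref{cor:cost_decrease_trick}.
\begin{verbatim}
import numpy as np

def burg_good_scenarios(p_star, q, N):
    """Indices omega-hat with p*/q < N/(N+1): one more observation of such a
    scenario is guaranteed to lower the worst-case cost (Burg entropy case)."""
    p_star, q = np.asarray(p_star), np.asarray(q)
    return np.where(p_star / q < N / (N + 1.0))[0]

# Hypothetical worst-case and empirical distributions, N = 20 observations.
p_star = np.array([0.15, 0.30, 0.55])
q      = np.array([0.25, 0.35, 0.40])
print(burg_good_scenarios(p_star, q, N=20))
\end{verbatim}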
The simple conditions in Proposition~\ref{prop:value} and Corollary~\ref{cor:cost_decrease_trick} provide insight into different scenarios for a decision maker.
Let $L = \left\{ \hat{\omega} : \sum_{\omega=1}^n q_\omega \phi^{*\prime}\left(\frac{N}{N+1}s^*_\omega\right) \left(\frac{N}{N+1}s^*_\omega\right) > \phi^*\left(\frac{N}{N+1}s^*_{\hat{\omega}}\right) \right\}$.
%That is, $L$ gives the set of scenarios that, if sampled one more observation, would result in a decrease in the optimal cost in \plp.
Set $L$ divides the scenarios into two groups: observing one more sample of a scenario in $L$ guarantees a drop in the overall cost.
Therefore, these scenarios can be considered `good' or `optimistic' scenarios.
Note that scenarios not in $L$ can also result in a cost decrease because condition \eqref{eq:cost_decrease_cond} is sufficient but not necessary.
The numerical experiments in Section \ref{sec:comp_results} suggest that $L$ is an adequate indicator of `good' scenarios for our test problem.
Finally, one might be interested in obtaining a lower bound on the probability that the next sample will decrease the optimal cost.
An approximate lower bound on the probability of selecting a sample in $L$ can be found by solving
\begin{equation}
\min_{r} \left\{ \sum_{\omega \in L} r_\omega \colon\ r \in \mathcal{P} \right\}. \label{eq:lb_probability}
\end{equation}
Because we do not know the true distribution, we find the minimum probability of the scenarios in $L$ within the ambiguity set defining \plp.
One can solve (\ref{eq:lb_probability}) by taking its dual.
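Alternatively, (\ref{eq:lb_probability}) can be handed directly to an off-the-shelf nonlinear solver. The sketch below (in Python, using the modified $\chi^2$-divergence and hypothetical $\q$, $L$, and radius) is a minimal illustration of this computation and is not meant to replace the dual approach.
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

def prob_lower_bound(L, q, rho, phi):
    """Minimize sum_{w in L} r_w over the phi-divergence ball of radius rho
    around q, i.e., problem (eq:lb_probability)."""
    n = len(q)
    c = np.zeros(n)
    c[list(L)] = 1.0
    cons = [{'type': 'eq',   'fun': lambda r: np.sum(r) - 1.0},
            {'type': 'ineq', 'fun': lambda r: rho - np.sum(q * phi(r / q))}]
    res = minimize(lambda r: c @ r, x0=np.array(q, dtype=float),
                   bounds=[(0.0, 1.0)] * n, constraints=cons, method='SLSQP')
    return res.fun

# Hypothetical data: modified chi^2 divergence phi(t) = (t-1)^2, radius rho.
q = np.array([0.25, 0.35, 0.40])
print(prob_lower_bound(L={0, 1}, q=q, rho=0.05, phi=lambda t: (t - 1.0) ** 2))
\end{verbatim}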
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Asymptotic Analysis}
\label{ssec:epiconvergence}
We now show that \plp\ behaves essentially the same as the corresponding SLP-2 with the (unknown) true distribution $\qtrue$ as $N\rightarrow \infty$.
This requires that the sequence of nominal probabilities $\q$ converge to $\qtrue$ with probability one (w.p.1) uniformly in $\omega$---a condition satisfied by the assumed empirical probabilities under mild assumptions.
To emphasize the nominal distribution's dependence on $N$, we use $\q^N$ in this section.
We begin by showing that the worst-case probabilities $\p^*$ obtained by solving \plp\ have a similar asymptotic behavior as $\q^N$.
\begin{proposition} \label{prop:weak_conv}
Suppose $\phi(t) \geq 0$ has a unique root at $t = 1$ and the observations are independent and identically distributed from a distribution with probability mass function $\qtrue$.
%in a way that $q^N_\omega$ obeys the Strong Law of Large Numbers (SLLN) for each $\omega$.
Then, w.p.1, $\sup_\omega |p^*_{\omega} - q^{\text{true}}_\omega| \rightarrow 0$ as $N \rightarrow \infty$.
%Let $p \neq \qtrue$.
\end{proposition}
\begin{proof}{\sc Proof of Proposition \ref{prop:weak_conv}.}
Because $q_\omega^N=\frac{N_\omega}{N}$ obeys the strong law of large numbers and $\omega =1,\ldots,n$ is a finite set, we have uniform convergence over $\omega$.
That is, $\sup_\omega |q^N_{\omega} - q^{\text{true}}_\omega| \rightarrow 0$ as $N \rightarrow \infty$, w.p.1.
Now, on a sample path that this convergence occurs, we will show that for all $\epsilon > 0$, there exists $N'$ (depending on the sample path) such that $\forall N \geq N'$ (on that path), $I_{\phi}(\p^*,\q^N) \leq \frac{\rho_0}{N}$ implies $\sup_\omega |p^*_\omega - q^{\text{true}}_\omega| \leq \epsilon$.
Because such sample paths have measure 1, our desired result will occur w.p.1.
Below, for simplicity of notation, we suppress the dependence on the particular sample path.
First, note that $\sup_\omega |p_\omega^* - q^{\text{true}}_\omega| \leq \sup_\omega |p_\omega^* - q^N_\omega| + \sup_\omega |q^N_\omega - q^{\text{true}}_\omega|$.
Assume, again for simplicity, $\epsilon$ is chosen so that $\min_{\omega} q^{\text{true}}_\omega > \frac{\epsilon}{2}$.
Let $N''$ be such that $\sup_\omega |q^N_\omega - q^{\text{true}}_\omega| \leq \frac{\epsilon}{2}$ for all $N \geq N''$.
This implies $q^N_\omega>0$ for all $N \geq N''$ for all $\omega$.
Now suppose $\sup_\omega |p_\omega^* - q^N_\omega| > \frac{\epsilon}{2}$.
Then, for at least one $\omega$---let's denote it $\ob$---we have either $p_{\ob}^* >q_{\ob}^N + \frac{\epsilon}{2}$ or $p_{\ob}^* < q_{\ob}^N - \frac{\epsilon}{2}$.
In either case, because $\phi(t)\geq 0$ is a convex function with a root at $t=1$, we can say for $\ob$
$$
\phi \left( \frac{p_{\ob}^*}{q^N_{\ob}} \right) \geq \min\left\{ \phi\left( \frac{q^N_{\ob}+\tfrac{\epsilon}{2}}{q^N_{\ob}} \right), \phi\left( \frac{q^N_{\ob}-\tfrac{\epsilon}{2}}{q^N_{\ob}} \right) \right\} \geq \min\left\{ \phi\left( 1+\frac{\epsilon}{2}\right), \phi\left( 1-\frac{\epsilon}{2}\right) \right\}.
$$
The last inequality follows from the facts that $\frac{a+\frac{\epsilon}{2}}{a} \geq 1+ \frac{\epsilon}{2}$ and $\frac{a-\frac{\epsilon}{2}}{a} \leq 1- \frac{\epsilon}{2}$ for $0<a\leq 1$, together with the monotonicity of $\phi$ on either side of $t=1$.
Putting this all together,
% under the set forth assumptions, we have
\begin{align}
I_{\phi}(\p^*,\q^N) & = \sum_{\omega=1}^n q^N_\omega \phi\left( \frac{p_\omega^*}{q^N_\omega} \right) \nonumber \\
% & = \bar{s} \mathbb{I}_{\bar{s} < \infty} \sum_{\omega \in Z} p_\omega + \sum_{\omega \notin Z} q^N_\omega \phi\left( \frac{p_\omega}{q^N_\omega} \right) \nonumber \\
& \geq \min_{\omega} \{q^N_\omega\} \cdot \phi \left( \frac{p_{\ob}^*}{q^N_{\ob}} \right) \nonumber \\
% & \geq \bar{s} \mathbb{I}_{\bar{s} < \infty} \sum_{\omega \in Z} p_\omega + \min_{\omega \notin Z} \{q^N_\omega\} \cdot \min\left\{ \phi\left(1+\frac{\epsilon}{2}\right), \phi\left(1-\frac{\epsilon}{2}\right) \right\} \label{eq:asymptotic_proof_phi_substitution} \\
& \geq \min_{\omega} \left\{ q^{\text{true}}_\omega - \frac{\epsilon}{2} \right\} \cdot \min\left\{ \phi\left(1+\frac{\epsilon}{2}\right), \phi\left(1-\frac{\epsilon}{2}\right) \right\} \label{eq:last}.
\end{align}
The right-hand side of (\ref{eq:last}) is positive because $\phi$ has a unique root at $t=1$.
By choosing $N'\geq N''$ to satisfy $\min_{\omega} \left\{ q^{\text{true}}_\omega - \frac{\epsilon}{2} \right\} \cdot \min\left\{ \phi\left(1+\frac{\epsilon}{2}\right), \phi\left(1-\frac{\epsilon}{2}\right) \right\} > \frac{\rho_0}{N'}$, we see that $\sup_\omega |p_\omega^* - q^N_\omega| > \frac{\epsilon}{2}$ implies $I_\phi(\p^*,\q^N) > \frac{\rho_0}{N}$ for all $N \geq N'$.
Because the $\phi$-divergence constraint ensures $I_\phi(\p^*,\q^N) \leq \frac{\rho_0}{N}$, we must have $\sup_\omega |p_\omega^* - q^N_\omega| \leq \frac{\epsilon}{2}$ for all $N \geq N'$, and the desired result follows from the triangle inequality above.
\Halmos
\end{proof}
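The choice of $N'$ in the last step can be made concrete. The following sketch (in Python, with hypothetical $\qtrue$, $\epsilon$, and $\rho_0$, and assuming Burg entropy for $\phi$) computes the threshold beyond which the radius $\rho_0/N$ falls below the lower bound in (\ref{eq:last}).
\begin{verbatim}
import numpy as np

# Burg entropy phi(t) = -log t + t - 1 (an assumed choice for illustration).
phi = lambda t: -np.log(t) + t - 1.0

q_true = np.array([0.2, 0.3, 0.5])   # hypothetical true distribution
eps, rho0 = 0.1, 1.0                 # hypothetical tolerance and radius constant

# Any p with sup_w |p_w - q^N_w| > eps/2 has I_phi(p, q^N) at least `lower',
# once q^N is within eps/2 of q_true (the bound in the proof):
lower = np.min(q_true - eps / 2) * min(phi(1 + eps / 2), phi(1 - eps / 2))

# Smallest N' with rho0 / N' strictly below that bound; for N >= N' the
# divergence-ball constraint forces sup_w |p*_w - q^N_w| <= eps/2.
N_prime = int(np.ceil(rho0 / lower)) + 1
print(lower, N_prime)
\end{verbatim}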
We are now ready to present the main result on the asymptotic behavior of \plp.
\begin{theorem}
\label{thm:epiconvergence}
% Assume $X \neq \emptyset$ is compact and problem (\ref{eq:slp_second_stage}) is primal and dual feasible for all $\omega$ and $\x \in X$.
Suppose $\phi(t) \geq 0$ has a unique root at $t = 1$ and the observations are independent and identically distributed from a distribution with probability mass function $\qtrue$.
Then, the optimal value of \plp\ given in (\ref{eq:plp_two_stage}) converges to that of SLP-2 given in (\ref{eq:slp_first_stage}) with $\qtrue$, and all limit points of the solutions $\x^*$ of \plp\ solve SLP-2 with $\qtrue$ as $N \rightarrow \infty$, w.p.1.
\end{theorem}
\begin{proof}{\sc Proof of Theorem \ref{thm:epiconvergence}.}
The assumptions on problems (\ref{eq:slp_second_stage}) and the set $X$ stated in Section \ref{ssec:basicprop} ensure that $\sup_{\omega, \x \in X}|h_\omega(\x)|<C$ for some constant $C<\infty$ and that SLP-2 with $\qtrue$ and \plp\ have finite optimal solutions w.p.1.
Let $f(\x,\omega)=\c\x + h_\omega(\x)$.
View the objective of \plp\ as $\ee{\p^*}{f(\x,\omega)}=\sum_{\omega=1}^{n}p^*_{\omega,N}(\x) f(\x,\omega)$ and the objective of SLP-2 with $\qtrue$ as $\ee{\qtrue}{f(\x,\omega)}=\sum_{\omega=1}^{n}q^{\text{true}}_\omega f(\x,\omega)$.
Here, $\p^*$ depends on the number of observations $N$, the actual observations collected, and also $\x$.
So, we use the longer notation $p^*_{\omega,N}(\x)$ inside the summation for clarity.
Following the same arguments as in the proof of Proposition \ref{prop:weak_conv}, we have for each $\x \in X$, $\sup_\omega |p^*_{\omega, N}(\x) - q^{\text{true}}_\omega| \rightarrow 0$ as $N \rightarrow \infty$, w.p.1.
Using this result, we can obtain a pointwise strong law of large numbers (SLLN)---that is, for each $\x \in X$, $|\ee{\p^*}{f(\x,\omega)}-\ee{\qtrue}{f(\x,\omega)}|\rightarrow 0$ as $N\rightarrow \infty$ w.p.1.
Because $X$ is convex and compact, $f(\x,\omega)$ is convex and continuous on $X$ for all $\omega$, and $\ee{\qtrue}{f(\x,\omega)}$ is finite valued and continuous on $X$, the desired result follows (see, e.g., Theorem 4 of \cite{shapiro_03}).
\Halmos
\end{proof}
The non-negativity condition on $\phi$ in Proposition \ref{prop:weak_conv} and Theorem \ref{thm:epiconvergence} is satisfied by every divergence in Table \ref{tb:phi_definitions} except Likelihood (which, however, can be rewritten as Burg entropy).
The unique root requirement, on the other hand, is violated for the special cases to be introduced in Section \ref{ssec:special_phi}.
%Proposition \ref{prop:weak_conv} implies that the worst-case distributions of (\ref{eq:plp_primal}) converge weakly to $\qtrue$, which is used to show the desired result below.
%In the next theorem, we establish the proof that the optimal value and solution of \plp\ converges to that of the SLP-2 with distribution $\qtrue$ by establishing the epiconvergence of \plp\ to SLP-2.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{A Classification of $\phi$-Divergences}
\label{sec:classification}
Given that there are many $\phi$-divergences to choose from, it is important to study how $\phi$-divergences act within an ambiguous (or, distributionally robust) stochastic optimization model.
We present a classification of $\phi$-divergences into four types, resulting from an examination of the limiting behavior of $\phi(t)$ as $t \searrow 0$ and $t \nearrow \infty$.
We begin in Section~\ref{ssec:suppressandpop} by defining two behaviors---suppressing and popping of scenarios---that result in our main classification.
Additional details on these behaviors along with a subclassification follow in Sections~\ref{ssec:suppress} and \ref{ssec:pop}.
Different classes of $\phi$-divergences may be suitable for different problem types and desired qualities in the ambiguous model.
We discuss modeling considerations with respect to our classification in Section \ref{ssec:modeling}, and we demonstrate the classification on risk models in Section \ref{ssec:special_phi}.
Throughout this section, we assume $0<\rho<\infty$, unless otherwise stated.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Suppressing and Popping of Scenarios: A Main Classification}
\label{ssec:suppressandpop}
Recall the definition of the ambiguity set $\mathcal{P}$, in particular, the $\phi$-divergence constraint
\[
\sum_{\omega = 1}^{n} q_\omega \phi\left(\frac{p_\omega}{q_\omega}\right) \leq \rho.
\]
Above, $\phi$ has arguments given by ratios of probabilities, $\tfrac{p_\omega}{q_\omega}$.
The case $q_\omega > 0$, $p_\omega = 0$ means that a scenario with a positive nominal probability has been assigned zero probability by the ambiguous counterpart problem.
This case corresponds to the limit of $\phi$ as $t \searrow 0$.
On the other hand, the case $q_\omega = 0$, $p_\omega > 0$ could mean that scenario $\omega$ has never been observed before---so it has a zero probability in the nominal distribution---but the ambiguous counterpart problem has assigned it a positive probability.
Recall that, by definition, $0 \phi(a/0) = a \lim_{t \rightarrow \infty} \frac{\phi(t)}{t}$, and so in the latter case, we need to examine $\lim_{t \nearrow \infty} \frac{\phi(t)}{t}$.
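Computationally, these conventions can be implemented directly. The sketch below (in Python, with hypothetical $\p$ and $\q$, and using Burg entropy, for which $\lim_{t \rightarrow \infty} \phi(t)/t = 1$) evaluates $I_\phi(\p,\q)$ accordingly.
\begin{verbatim}
import numpy as np

def phi_divergence(p, q, phi, phi_slope_at_inf):
    """I_phi(p, q) with the conventions 0*phi(0/0) = 0 and
    0*phi(a/0) = a * lim_{t -> inf} phi(t)/t."""
    total = 0.0
    for pw, qw in zip(p, q):
        if qw > 0:
            total += qw * phi(pw / qw)
        elif pw > 0:                 # q_w = 0 but p_w > 0
            total += pw * phi_slope_at_inf
        # q_w = p_w = 0 contributes nothing
    return total

# Burg entropy: phi(t) = -log t + t - 1, with lim_{t->inf} phi(t)/t = 1.
burg = lambda t: -np.log(t) + t - 1.0
print(phi_divergence([0.2, 0.3, 0.5], [0.25, 0.0, 0.75], burg, 1.0))
\end{verbatim}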
Consider each of the limiting cases in more detail:
\begin{itemize}
\item {\sc Case 1:} ($q_\omega > 0$ but $p_\omega = 0$)
We call this the ``{\bf Suppressing}'' behavior because a scenario with a positive probability in the nominal distribution can take zero probability in the ambiguous problem.