\documentclass{beamer}
\usepackage{tikz}
\usepackage{amsbsy}
\usepackage{hyperref}
\usetikzlibrary{positioning,shadows,arrows,shapes,calc}
\renewcommand{\labelenumi}{\theenumi}
\usepackage{devanagari}
\usepackage{multimedia}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage{stackrel}
\newcommand{\nop}{\rule{0in}{0in}}
\usepackage{amsmath}
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
\mode<presentation>{\usetheme{Frankfurt}}
\AtBeginSection[]
{
\begin{frame}
\frametitle{Outline}
\tableofcontents[currentsection,currentsubsection]
\end{frame}
}
\title{Three Proposals for Fairness in Speech Recognition}
\author{Mark Hasegawa-Johnson and Heting Gao}
\begin{document}
% Title
\begin{frame}
\maketitle
\end{frame}
% Title
\begin{frame}
\tableofcontents
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section[Speech]{Fairness in Speech Recognition}
\setcounter{subsection}{1}
\begin{frame}
\frametitle{Speech Recognition}
Speech recognition is the task of converting $\pmb{X}$ to $\pmb{Y}$, where
\begin{itemize}
\item $\pmb{X}=[X_1,\ldots,X_T]$ is a sequence of random spectra
whose instance value is $\mathbf{x}=[x_1,\ldots,x_T]$,
$x_t\in{\mathcal X}$.
\item $\pmb{Y}=[Y_1,\ldots,Y_S]$ is a sequence of phones, words, or
characters whose instance value is $\mathbf{y}=[y_1,\ldots,y_S]$,
$y_s\in{\mathcal Y}$, $S\le T$.
\end{itemize}
The problem we have: speech recognition is less accurate for people
with high-pitched voices (women), less accurate for people with
dialects that it has not seen during training (African American
English, Hispanic English, Indian English, Bronx, Cockney,
\ldots), and less accurate for people with some types of
disabilities (e.g., Cerebral Palsy, Parkinson's Disease, ALS).
\end{frame}
\begin{frame}{Data Sources}
\begin{small}
\begin{itemize}
\item AA (CORAAL): \url{https://oraal.uoregon.edu/coraal}
\item AF (AST Afrikaans English) \url{https://vlo.clarin.eu/record/https_58__47__47_hdl.handle.net_47_20.500.12185_47_411_64_format_61_cmdi?2}
\item AM (Librispeech): \url{http://www.openslr.org/12}
\item BR (Cambridge Read News): LDC95S24
\item IN (maheshchandra-20160719-cgc, maheshchandra-20160719-com, maheshchandra-20160719-e01) \url{http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/8kHz_16bit}
\item SP (Hisp-eng): LDC2014S05
\item XH (AST Black English): \url{https://repo.sadilar.org/handle/20.500.12185/433}
\item UA: \url{http://ifp-08.ifp.uiuc.edu/protected/UASPEECH} (Password: please ask Mark.
Participants gave permission for
research use, but not commercial use or redistribution.)
\end{itemize}
\end{small}
\end{frame}
\begin{frame}{Experiments\\{\small Gao, 2020}}
\begin{itemize}
\item AA for African American, AF for Afrikaans, AM for American, BR for British, IN for Indian, SP for Spanish in central-south America and XH for Xhosa-Zulu-Sotho.
\begin{center}
{\scriptsize
\begin{tabular}{ |c|c|c|c|c|c|c|c|} \hline
& \textbf{Base20000} & \multicolumn{2}{c|}{\textbf{Base3000}} & \multicolumn{3}{c|}{\textbf{BaseIndv3000}} \\ \hline
Dialect & CTCLoss & CTCLoss & CER & Train Size & CTCLoss & CER \\ \hline
AA & 2.43 & 2.47 & 67.71\% & 3000 & 2.45 & 68.91\% \\ \hline
AF & 1.69 & 1.70 & 49.66\% & 3000 & 1.14 & 26.87\% \\ \hline
AM & 1.61 & 1.62 & 49.34\% & 3000 & 1.39 & 41.43\% \\ \hline
BR & 1.79 & 1.83 & 54.07\% & 3000 & 1.62 & 47.96\% \\ \hline
IN & 2.28 & 2.40 & 61.16\% & 300 & 4.05 & 93.11\% \\ \hline
SP & 2.08 & 2.07 & 56.86\% & 200 & 2.86 & 79.99\% \\ \hline
XH & 1.94 & 2.00 & 55.78\% & 2619 & 1.75 & 46.83\% \\ \hline
\end{tabular}
}
\end{center}
\item Difficulty varies across different dialects (AA).
\item Training on Individual Language can reduce the error rate (AF, AM, BR, XH).
\item Sample size affects accuracy (IN).
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Fairness in AI: General Definitions}
Our goal is to estimate $Y$ given $X$, where $X$ is stuff we know
(e.g., employment history) and $Y$ is something we'd like to know
(e.g., probability you'll default on your mortgage). We want to
find $\hat{Y}(X)$, a neural net, that is ``fair'' to people with
different values of some protected attribute $A$ (e.g., gender,
race, dialect, disability). What does ``fair'' mean?
\begin{enumerate}
\item {\bf Demographic Parity}
\[ P(\hat{Y}=y|A=a)=P(\hat{Y}=y|A=a')~~\forall y,a,a' \]
\item {\bf Equal Opportunity}
\[ P(\hat{Y}=y|A=a,Y=y)=P(\hat{Y}=y|A=a',Y=y)~~\forall y,a,a' \]
\item {\bf Counterfactual Fairness}
\[ P(\hat{Y}_{A\leftarrow a}(U)=y|Y=y,A=a)=P(\hat{Y}_{A\leftarrow a'}(U)=y|Y=y,A=a) \]
\end{enumerate}
\end{frame}
\begin{frame}
\frametitle{Demographic Parity}
\[ P(\hat{Y}=y|A=a)=P(\hat{Y}=y|A=a')~~\forall y,a,a' \]
\begin{itemize}
\item Pro:
\begin{itemize}
\item Easy to compute.
\item Useful if the ground truth labels, $Y=y$, depend on $A$ in
a way that is historically unfair, and you don't want to
reproduce that unfairness.
\end{itemize}
\item Con:
\begin{itemize}
\item Useless if demographic parity is a socially undesirable
outcome.
\item Population measure, not an individual measure: Doesn't
measure the degree to which any particular individual is
treated fairly.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Demographic Parity Fails for Speech}
\[ P(\hat{Y}=y|A=a)\ne P(\hat{Y}=y|A=a')~~\mbox{for some}~y,a,a' \]
\begin{itemize}
\item DP is inappropriate for speech. Blacks and whites/men and
women/Irish and English people say different things. We don't
want an ASR that forces them to have the same posterior
probabilities.
\item This fails even if you apply it to individual phonemes.
Different dialects use different words (and therefore different
characters), and use different phonemes even to say the same
words.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Equal Opportunity}
\[ P(\hat{Y}=y|A=a,Y=y)=P(\hat{Y}=y|A=a',Y=y)~~\forall y,a,a' \]
\begin{itemize}
\item Pro:
\begin{itemize}
\item Easy to compute.
\item Useful if we trust the ground truth labels, $Y=y$, and
we want our algorithm to reproduce them with equal accuracy for
all demographic groups.
\end{itemize}
\item Con:
\begin{itemize}
\item Useless if the dependence of $Y$ on $A$ is already
unfair.
\item Population measure, not an individual measure: Doesn't
measure the degree to which any particular individual is
treated fairly.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Equal Opportunity Works for Speech}
\[ P(\hat{Y}=y|A=a,Y=y)=P(\hat{Y}=y|A=a',Y=y)~~\forall y,a,a' \]
\begin{itemize}
\item This is exactly what we want: equal accuracy for all values
of $A$.
\item It's still a population measure: it doesn't measure the
accuracy for any given individual in a population.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{The ``Death in Childbirth'' Clause}
\begin{itemize}
\item In 1900, almost 1\% of all women died in childbirth. 0\% of
men died in childbirth. The solution was not to increase the
number of men who died during childbirth; the solution was to
decrease the number of women who died that way.
\item Similarly, in the pursuit of fairness, if we ever develop a
loss function that says ``increase fairness by decreasing the
error rate for group A, and increasing the error rate for group
B,'' then we should just delete the second term in the loss
function.
\item
Treat it as a minimax problem (minimize the maximum of two error
rates).
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\section[CTC]{Connectionist Temporal Classification}
\setcounter{subsection}{1}
\begin{frame}
\frametitle{Connectionist Temporal Classification\\{\small Graves,
Fern\'{a}ndez, Gomez \& Schmidhuber, ICML 2006}}
\begin{itemize}
\item $\theta=$ model parameters.
\item {\bf Spectrogram:} $\mathbf{X}=[X_1,\ldots,X_T]=$ sequence of
random spectra whose instance value is
$\mathbf{x}=[x_1,\ldots,x_T]$, $x_t\in{\mathcal X}$.
\item {\bf Transcription:} $\mathbf{Y}=[Y_1,\ldots,Y_S]=$ sequence
of characters whose instance value is
$\mathbf{y}=[y_1,\ldots,y_S]$, $y_s\in{\mathcal Y}$, $S\le T$.
\item {\bf Time-Aligned Transcription:}
$\mathbf{\Pi}=[\Pi_1,\ldots,\Pi_T]=$ sequence of time-aligned
characters whose instance value is
$\pmb{\pi}=[\pi_1,\ldots,\pi_T]$, where each time-aligned
character is either one of the transcription characters
($\pi_t=y_s$ for some $s$), or else $\pi_t=\varnothing$.
$\varnothing$ is a special ``blank'' character. For example,
suppose an $S=5$-character text (``hello'') is encoded in a
$T=14$-frame speech waveform; the alignment might be:
\[
\pmb{y}=[h,e,l,l,o],~~~\pmb{\pi}=[h,h,e,e,e,\varnothing,\varnothing,l,l,l,\varnothing,l,\varnothing,o]
\]
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{The time-compression function}
In order to compress $\pmb{\pi}=[\pi_1,\ldots,\pi_T]$ down to the
output length $\pmb{y}=[y_1,\ldots,y_S]$, for $S\le T$, we need a
time-compression function:
\begin{itemize}
\item ${\mathcal B}:\left({\mathcal Y}\cup\left\{\varnothing\right\}\right)^{+}\rightarrow{\mathcal Y}^{+}$ is a surjective function:
\begin{itemize}
\item For any valid $\pmb{\pi}$, ${\mathcal B}(\pmb{\pi})$ is a unique $\pmb{y}$.
\item For any valid $\pmb{y}$, ${\mathcal B}^{-1}(\pmb{y})$ is a set of possible values of $\pmb{\pi}$.
\end{itemize}
\end{itemize}
The most common ${\mathcal B}$-function (the one invented by Graves
et al.) does this:
\centerline{\begin{tabular}{rp{1cm}|l}\hline
Start with a length-$T$ sequence: && $\pmb{\pi}=$hheee$\varnothing\varnothing$lll$\varnothing$l$\varnothing$o\\\hline\hline
(1) eliminate duplicates: && he$\varnothing$l$\varnothing$l$\varnothing$o\\\hline\hline
(2) eliminate blanks: && $\pmb{y}=$hello\\\hline
\end{tabular}}
\end{frame}
\begin{frame}
\frametitle{The CTC loss function}
A CTC-based speech recognizer is trained to minimize the CTC loss. The CTC loss
is the cross-entropy (negative log probability) of the correct label sequence:
\[ {\mathcal L}_{CTC}(\theta)=-\ln P(\pmb{\hat{Y}}=\pmb{y}|\pmb{X}=\pmb{x}) \]
\[ = -\ln\sum_{\pmb{\pi}\in{\mathcal B}^{-1}(\pmb{y})}\prod_{t=1}^T q_t(\pi_t) \]
\[ = -\mbox{logsumexp}_{\pmb{\pi}\in{\mathcal B}^{-1}(\pmb{y})}\sum_{t=1}^T \ln q_t(\pi_t) \]
where $ q_t(\pi_t)$ is the output of a softmax layer at time $t$.
The input of this softmax layer is a bLSTM, Transformer, or some
other neural net parameterized by $\theta$ and having access to the
whole sequence $\pmb{x}$, so
\begin{displaymath}
q_t(\pi) = P(\hat{\Pi}_t=\pi|\pmb{x},\theta)
\end{displaymath}
\end{frame}
\begin{frame}
\frametitle{How to compute CTC loss: turn it into EM}
Taking advantage of
$\frac{d}{d\theta}\ln{f}(\theta)=\frac{1}{f(\theta)}\frac{df}{d\theta}$,
you can prove that
\begin{displaymath}
\nabla_\theta{\mathcal{L}}_{CTC}(\theta)=\nabla_\theta{\mathcal{L}}_{EM}(\theta,\bar\theta)
\end{displaymath}
where $\mathcal{L}_{EM}$ is the weighted average, over all frames
$t$ and over all characters $y$, of $-\ln q_t(y)$:
\begin{displaymath}
{\mathcal{L}}_{EM}(\theta,\bar\theta)=
- \sum_{y}\sum_t \gamma_t(y) \ln q_t(y)
\end{displaymath}
The weighting function is the alignment posterior, defined as
\begin{displaymath}
\gamma_t(y)
= P(\Pi_t=y|\pmb{x},\pmb{y},\theta)
= \frac{\sum_{\pmb{\pi}\in{\mathcal B}^{-1}(\pmb{y}),\pi_t=y}
\exp\left(\sum_{t'=1}^T\ln q_{t'}(\pi_{t'})\right)}
{\sum_{\pmb{\pi}\in{\mathcal B}^{-1}(\pmb{y})}\exp\left(\sum_{t'=1}^T\ln q_{t'}(\pi_{t'})\right)}
\end{displaymath}
(Remember that $ q_t(y)$ is the recognition posterior:)
\begin{displaymath}
q_t(y)=P(\Pi_t=y|\pmb{x},\theta )
\end{displaymath}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\section[1:EOA]{Proposal \#1: Equal Opportunity Accuracy}
\setcounter{subsection}{1}
\begin{frame}
\frametitle{{\large Deep F-measure Maximization for End-to-End Speech Understanding}\\{\small Sar\i \& Hasegawa-Johnson, Interspeech 2020}}
Maximum-accuracy training biases a DNN toward the majority class.
Reduced bias is achieved by trying to optimize the harmonic average
of precision and recall, averaged across all classes.
\begin{itemize}
\item $N_k$=true number of tokens in class $k$
\item $TP(k)=$ true positive detections of class $k$
\item $FP(k)=$ false positive detections of class $k$
\item $\mbox{Rec}(k)=TP(k)/N_k=$ recall of class $k$
\item $\mbox{Prec}(k)=TP(k)/(TP(k)+FP(k))=$ precision of class $k$
\item $F_\beta(k)=\frac{(1+\beta^2)\mbox{Rec}(k)\mbox{Prec}(k)}{\beta^2\mbox{Prec(k)}+\mbox{Rec}(k)}=$
$\beta$-weighted harmonic average of precision and recall for class $k$.
\item Macro-averaged $F_\beta$ measure:
\[
F_\beta=\frac{1}{K}\sum_{k=1}^K\frac{(1+\beta^2)TP(k)}{\beta^2N_k+(TP(k)+FP(k))}
\]
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{{\large Deep F-measure Maximization for End-to-End Speech Understanding}\\{\small Sar\i \& Hasegawa-Johnson, Interspeech 2020}}
$F_\beta$ requires counting the number of true positives and false positives; this is not a
differentiable operation. A differentiable approximation can be achieved as follows:
\[
TP(k)\approx \sum_{n\in S_k}q_n'(k),~~~FP(k)\approx\sum_{n\not\in S_k}q_n'(k)
\]
where $S_k$ is the set of training tokens from class $k$, $S$ is the
total training dataset, and $q_n'(k)$ is the renormalized softmax
output of the classifier,
\[
q_n'(k)=\frac{q_n(k)}{\max_{l}q_n(l)}
\]
Therefore
\[
{\mathcal L}=-\frac{1}{K}\sum_{k=1}^K\frac{(1+\beta^2)\sum_{n\in S_k}q_n'(k)}{\beta^2N_k+\sum_{n\in S}q_n'(k)}
\]
\end{frame}
\begin{frame}
\frametitle{Soft Counts for Equal Opportunity Training\\{\small Mina, 2020}}
An Equal-Opportunity AI is one that satisfies
\[ \left|P(\hat{Y}=y|A=a,Y=y)-P(\hat{Y}=y|A=a',Y=y)\right|=0~~\forall y,a,a' \]
Rusty Mina proposed estimating the probabilities using Deep-F-style soft counts:
\[
P(\hat{Y}=y|A=a,Y=y)=\frac{\sum_{n\in S_{y,a}}q_n'(y)}{|S_{y,a}|}
\]
where $S_{y,a}$ is the data subset with labels $Y=y,A=a$ and
$|S_{y,a}|$ is its cardinality. Equal Opportunity is achieved by
setting the following loss function to zero:
\[{\mathcal L}_{EO}=\sum_y\sum_{a,a'}\left|
\frac{\sum_{n\in S(y,a)}q_n'(y)}{|S_{y,a}|}-\frac{\sum_{n\in S(y,a')}q_n'(y)}{|S_{y,a'}|}\right|
\]
\end{frame}
\begin{frame}
\frametitle{Soft-Count Equal Opportunity for Speech}
\begin{displaymath}
{\mathcal L}_{EO}=\sum_y\sum_{a,a'}\left|
\frac{\sum_{n\in S(y,a)}q_n'(y)}{|S_{y,a}|}-\frac{\sum_{n\in S(y,a')}q_n'(y)}{|S_{y,a'}|}\right|
\end{displaymath}
In speech, $\pmb{y}=[y_1,\ldots,y_S]$. Here are three main
possibilities:
\begin{enumerate}
\item {\bf Matched Transcriptions:} $S(\pmb{y},a)$, could be defined
as ``the set of waveforms that have exactly the same
transcription.'' Only works for UASPEECH and TIMIT.
\item {\bf Matched Frames}: $S(y,a)$ could be defined as ``the set
of frames for which the recognizer should output character $y$.''
The problem is, we have no time alignment, so we don't know which
frames those are.
\item {\bf Equal Opportunity Accuracy:} In speech, the metric
$P(\hat{Y}|Y)$ is just sentence accuracy. We could demand that
accuracy is the same for different demographic groups, even if
they never say exactly the same things.
\end{enumerate}
\end{frame}
\begin{frame}
\frametitle{Equal Opportunity Accuracy}
Sentence accuracy is normally defined, in ASR, as the probability
that the recognizer output transcription $\pmb{\hat{Y}}$ exactly
matches the correct transcription $\pmb{y}$, averaged over all
transcribed utterances $(\pmb{x},\pmb{y})$ in the training dataset.
Sentence accuracy is equal, for every pair of demographic attributes
$(a,a')$, if the following loss function is zero:
\begin{displaymath}
{\mathcal L}_{EOA}= \sum_{a,a'}\left|
\frac{1}{|S_a|}\sum_{(\pmb{x},\pmb{y})\in S_a} P(\pmb{\hat{Y}}=\pmb{y}|\pmb{x}) -
\frac{1}{|S_{a'}|}\sum_{(\pmb{x},\pmb{y})\in S_{a'}} P(\pmb{\hat{Y}}=\pmb{y}|\pmb{x})
\right|
\end{displaymath}
where $S_a=\left\{(\pmb{x},\pmb{y})|A=a\right\}$.
\end{frame}
\begin{frame}
\frametitle{The ``Death in Childbirth'' Clause}
As defined on the previous slide, ${\mathcal L}_{EOA}$ has a ``death
in childbirth'' problem: it specifically encourages the recognizer
to {\bf increase the error rate} of the more accurately-recognized
demographic. Instead, let's modify it to focus exclusively on {\bf
decreasing the error rate} of the less-accurately recognized
demographic:
\begin{displaymath}
{\mathcal L}_{EOA}= \sum_{a_1,a_2\sim A}\max_{a\in\left\{a_1,a_2\right\}}\left(
-\frac{1}{|S_a|}\sum_{(\pmb{x},\pmb{y})\in S_a} P(\pmb{\hat{Y}}=\pmb{y}|\pmb{x})\right)
\end{displaymath}
\end{frame}
\begin{frame}
\frametitle{Integration with CTC}
${\mathcal L}_{EOA}$ is now just a weighted form of ${\mathcal
L}_{CTC}$: it gives more weight to $(\pmb{x},\pmb{y})$ pairs that
come from a demographic group $A=a$ that currently has worse error
rates. Just like CTC, $\nabla_\theta{\mathcal
L}_{EOA}(\theta)=\nabla_\theta{\mathcal
L}_{WEM}(\theta,\bar\theta)$, where ${\mathcal L}_{WEM}$ is a
demographically-weighted EM measure defined as:
\begin{displaymath}
{\mathcal{L}}_{WEM}(\theta,\bar\theta)=
\sum_{a_1,a_2\sim A}\max_{a\in\left\{a_1,a_2\right\}}\left(
-\frac{1}{|S_a|}\sum_{y}\sum_{t\in S_a} \gamma_t(y) \ln q_t(y)
\right)
\end{displaymath}
where $\gamma$ and $ q$ should (I think) have the demographic
features as inputs:
\begin{displaymath}
q_t(y)=P(\Pi_t=y|\mathbf{x},a,\theta)
\end{displaymath}
\begin{displaymath}
\gamma_t(y)=P(\Pi_t=y|\mathbf{x},\mathbf{y},a,\bar\theta)
\end{displaymath}
\end{frame}
\begin{frame}{Training Procedure}
\begin{enumerate}
\item Maybe start with a few epochs of standard CTC:
\begin{displaymath}
{\mathcal{L}}_{EM}(\theta,\bar\theta)=
- \sum_{y}\sum_t \gamma_t(y) \ln q_t(y)
\end{displaymath}
\item Then assigning weights:
\begin{displaymath}
{\mathcal{L}}_{WEM}(\theta,\bar\theta)=
\sum_{a_1,a_2\sim A}\max_{a\in\left\{a_1,a_2\right\}}\left(
-\frac{1}{|S_a|}\sum_{y}\sum_{t\in S_a} \gamma_t(y) \ln q_t(y)
\right)
\end{displaymath}
\begin{displaymath}
=\sum_{a\sim A}\left(
-\frac{N_{\le a}}{|S_a|}\sum_{y}\sum_{t\in S_a} \gamma_t(y) \ln q_t(y)
\right)
\end{displaymath}
where $N_{\le a}=$ \# other groups that have lower loss than group
$a$. I think that loss measure should actually be pretty robust,
and can probably run until convergence.
\end{enumerate}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\section[2:CFMI]{Proposal \#2: Counterfactual Fairness based on Mutual Information}
\setcounter{subsection}{1}
\begin{frame}
\frametitle{Counterfactual Fairness\\{\small (Kusner, Loftus, Russell \& Silva, NIPS 2017)}}
\[ P(\hat{Y}_{A\leftarrow a}(U)=y|X=x,A=a)=P(\hat{Y}_{A\leftarrow a'}(U)=y|X=x,A=a) \]
\begin{itemize}
\item Pro:
\begin{itemize}
\item Useful regardless of whether or not the dependence of $Y$ on $A$ is
historically biased.
\item Characterizes the individual by protected attribute $A=a$,
other observable attributes $X=x$, and underlying latent
attributes $U=u$. Demands fairness for every individual:
demands that $\hat{Y}$ be unchanged, even if $A$ were changed
to $a'$, keeping all other attributes (even the unknown latent
attributes $U=u$) the same.
\end{itemize}
\item Con:
\begin{itemize}
\item In speech recognition (at least), it seems to have many different possible
interpretations.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{How to compute a ``counterfactual'' posterior:}
\[ P(\hat{Y}_{A\leftarrow a}(U)=y|Y=y,A=a)=P(\hat{Y}_{A\leftarrow a'}(U)=y|Y=y,A=a) \]
\begin{enumerate}
\item {\bf Abduction:} Assume latent variables $U$ with some prior
$P(U)$. Given each training datum $(X=x,Y=y,A=a)$, compute the
posterior $P(U|X=x,Y=y,A=a)$.
\item {\bf Action:} Set $A$ to $a'$ (don't recompute any
probabilities yet).
\item {\bf Prediction:} Compute
\[ P(\hat{Y}_{A\leftarrow a'}(U)=y|Y=y,A=a) = \]
\[ \sum_u P(U=u|X=x,Y=y,A=a)P(\hat{Y}=y|X=x,U=u,A=a')\]
\end{enumerate}
\end{frame}
\begin{frame}
\frametitle{Levels of increasing modeling power} The following types
of models are counterfactually fair. Kusner et al.\ claim that
models at higher levels of the following ontology can predict $Y$
with better precision than lower-level models:
\begin{enumerate}
\item Build $\hat{Y}$ using only observable non-descendants of $A$.
Problem: in real data, most observables are influenced by $A$.
\item Postulate latent variables with some prior $P(U)$, estimate
$P(X|A,U)$ from the data, then abduct $P(U|X=x,Y=y,A=a)$ for each
training example.
\item Postulate a fully deterministic model with latent ``residual''
variables $e\sim E$ such that
\begin{enumerate}
\item $x=f(a,e)$ is a deterministic function, e.g., linear
regression $x=wa+b+e$ with constant weight $w$ and bias $b$, and
\item $E$ is independent of $A$.
\end{enumerate}
Then train $\hat{Y}=g(e)$, independent of $A$ by design.
\end{enumerate}
\end{frame}
\begin{frame}
\frametitle{Counterfactual Fairness and Speech}
\begin{itemize}
\item Level-3 counterfactual fairness is inappropriate for speech,
for the same reason that DP is inappropriate: We don't want
$\hat{Y}=g(e)$ for $E$ independent of $A$, because we don't want
$\hat{Y}$ to be independent of $A$ (blacks and whites say
different things; we want to preserve that difference, not ignore
it).
\item Level-2 can model this. Assume $P(U)$, then
\begin{enumerate}
\item learn $P(X|A,Y,U)$,
\item abduct $P(U|X=x,Y=y,A=a)$ for each datum,
\item predict $P(\hat{Y}|U=u,X=x,A=a')$.
\end{enumerate}
The problem is how to design $U$ so that it contains all the
information that $X$ carries about $Y$, including any information
about $A$ that is embedded in the relationship between $X$ and
$Y$, but none of the extra information about $A$ that might be in
$X$ but absent in $Y$.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Counterfactual fairness and speech}
The problem is how to design $U$ so that:
\begin{itemize}
\item it contains all the information that $X$ carries about $Y$,
including any information about $A$ that is embedded in the
relationship between $X$ and $Y$. In terms of mutual information,
we want $I(X,Y)=I(U,Y)$. If $U$ is a deterministic function of
$X$, then there is a theorem that says $I(X,Y)\ge I(U,Y)$ always,
so our goal is to make $I(U,Y)$ as large as possible.
\item it contains none of the extra information about $A$ that might
be in $X$ but absent in $Y$. In terms of mutual information, we
want $I(Y,(U,A))=I(Y,A)$. There is a theorem that says $I(Y,(U,A))\ge
I(Y,A)$ always, so our goal is to make $I(Y,(U,A))$ as small as
possible.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Counterfactual Fairness based on Mutual Information}
Suppose we constrain $U=g(X,A)$ to be a deterministic function of
$X$ and $A$ (so that it can be computed during test time). Then
\begin{itemize}
\item We want $I(U,Y)$ as large as possible:
\[ U = \arg\max I(U,Y) = \arg\max H(Y)-H(Y|U) \]
\item We want $I((U,A),Y)$ as small as possible:
\[ U = \arg\min I((U,A),Y) = \arg\min H(Y)-H(Y|U,A) \]
\end{itemize}
Thus $U=g(X,A)$ is trained to minimize
\begin{displaymath}
{\mathcal L}_{CF}=H(Y|U)-H(Y|U,A)
\end{displaymath}
\end{frame}
\begin{frame}
\frametitle{Counterfactual Fairness based on Mutual Information}
$\pmb{U}=g(\pmb{X},A)$ is trained to minimize
\begin{displaymath}
{\mathcal L}_{CF}=H(\pmb{Y}|\pmb{U})-H(\pmb{Y}|\pmb{U},A)
\end{displaymath}
Putting that into the CTC framework, we want $U=g(X,A)$ trained to minimize
\begin{displaymath}
{\mathcal{L}}_{CFMI}=
-\sum_{y}\sum_t\gamma_t(y)\left(\ln q_t(y|\mathbf{u})-\ln q_t(y|\mathbf{u},a)\right)
\end{displaymath}
where
\begin{itemize}
\item $\pmb{u}(\pmb{x},a)$ is a bLSTM layer.
\item $ q_t(y|\pmb{u})$ and $ q_t(y|\pmb{u},a)$ are two separate
branches after the layer $\pmb{u}$, each separately pre-trained to
optimize ${\mathcal L}_{CTC}$.
\item After $ q_t(y|\pmb{u})$ and $ q_t(y|\pmb{u},a)$ have been
pre-trained, then we fix them, and re-train $\pmb{u}(\pmb{x},a)$
in order to minimize
\begin{displaymath}
(1-\lambda){\mathcal{L}}_{EM}+\lambda{\mathcal{L}}_{CFMI}=
-\sum_{y}\sum_t\gamma_t(y)\left(\ln q_t(y|\mathbf{u})-\lambda\ln q_t(y|\mathbf{u},a)\right)
\end{displaymath}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\section[3:SMOTE]{Proposal \#3: SMOTE and EO}
\setcounter{subsection}{1}
\begin{frame}
\frametitle{Synthetic minority oversampling technique\\{\small Chawla, Bowyer, Hall \& Kegelmeyer, JAIR 2002}}
SMOTE is a data augmentation trick. I'm using it here to exemplify
all possible data augmentation and active learning methods. It
usually outperforms most other methods.
SMOTE generates artificial data for the minority class by iterating
the following process, until all classes have equal numbers of
samples:
\begin{enumerate}
\item Choose, at random, two samples from the minority class, $x_1$
and $x_2$.
\item Choose a number $\lambda$ uniformly at random between 0 and 1.
\item Generate a new training sample as $\lambda
x_1+(1-\lambda)x_2$.
\end{enumerate}
\end{frame}
\begin{frame}
\frametitle{SMOTE for Time-Series Data?}
\begin{itemize}
\item SMOTE was proposed for vector data. It has rarely been used
for time-series data, and never (as far as I know) to generate
synthetic data for ASR. The ``curse of dimensionality'' makes the
process $\lambda x_1+(1-\lambda)x_2$ unreliable for generating
realistic training examples.
\item Zhu, Lin \& Liu (\url{https://arxiv.org/pdf/2004.06373.pdf}, 2020)
propose OHIT (Oversampling High-dimensional Imbalanced
Time-series), which uses robust covariance estimators to cluster
the minority class samples prior to SMOTE.
\item Speech would need one more step: use dynamic time warping
(DTW) to time-align the two examples, prior to SMOTE. As far as I
know, it has never been tried, and should be tried.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{SMOTE for ASR?}
For example, SMOTE for ASR could do this:
\begin{enumerate}
\item Choose, at random, two samples from the minority class, $x_1$
and $x_2$, with the same text $Y=y$. Use dynamic time warping to
align them.
\item Choose a number $\lambda$ uniformly at random between 0 and 1.
\item Generate a new training sample as $\lambda
x_1+(1-\lambda)x_2$ (interpolate the MFCCs, not the spectra).
\end{enumerate}
\end{frame}
\begin{frame}
\frametitle{SMOTE and EO}
Equal opportunity says that we want
\[ P(\hat{Y}=y|A=a,Y=y)=P(\hat{Y}=y|A=a',Y=y)~~\forall y,a,a' \]
So instead of choosing utterances $x_1$ and $x_2$ uniformly at
random, we could choose them with a probability proportional to
\[
P(\mbox{choose}~y|a)\propto \max\left(0,\max_{a'\ne a}P(y|a')-P(y|a)\right)
\]
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\section[Concl]{Conclusions}
\setcounter{subsection}{1}
\begin{frame}
\frametitle{Conclusions}
I've sketched three possible methods to make ASR more fair:
\begin{enumerate}
\item Equal Opportunity Accuracy: results in a one-part loss
function, a weighted average of CTC losses in different
demographics.
\item Counterfactual Fairness based on Mutual Information: results
in a two-part loss function, very similar to adversarial training,
but with a different definition of the adversary.
\item Data Augmentation for Equal Opportunity: create augmented data
based on the samples that are treated least fairly by the ASR.
\end{enumerate}
\end{frame}
\end{document}