\documentclass{llncs}
\input{psfig.sty}
\usepackage{amsmath}
\usepackage{epsfig}
\usepackage{amssymb}
\begin{document}
\pagestyle{empty}
\mainmatter
\title{Analysis of information in speech and its application in speech recognition}
\author{Sachin S. Kajarekar\inst{1} \and Hynek Hermansky\inst{1,2}}
\institute{Oregon Graduate Institute of Science and Technology, \\
Beaverton, OR, USA.\\
\email{\{sachin,hynek\}@ece.ogi.edu} \\
\texttt{http://www.asp.ece.ogi.edu/index.html}
\and
International Computer Science Institute, \\
Berkeley, CA, USA. }
\maketitle
\begin{abstract}
Previous work analyzed the information in speech using analysis of variance
(ANOVA). ANOVA assumes that sources of information (phone, speaker, and
channel) are univariate gaussian. The sources of information, however, are
not unimodal gaussian. Phones in speech recognition, e.g., are generally
modeled using a multi-state, multi-mixture model. Therefore, this work
extends ANOVA by assuming phones with 3 state, single mixture distribution
and 5 state, single mixture distribution. This multi-state model was
obtained by extracting variability due to position within phone from the
error term in ANOVA. Further, linear discriminant analysis (LDA) is used to
design discriminant features that better represent both the phone-induced
variability and the position-within-phone variability. These features
perform significantly better than conventional discriminant features
obtained from 1-state phone model on continuous digit recognition task.
\end{abstract}
\section{Introduction}
Speech signal and features extracted from the signal contain various sources
of information. For speech and speaker recognition, phone, speaker, and
communication channel (henceforth referred to as channel) are the most
important sources. Previous work \cite{snh991,snh992} analyzed information
in speech using analysis of variance (ANOVA) \cite{ht96}. The contribution of phone,
speaker and channel was estimated in spectral and temporal domains.
As explained in section 2, ANOVA assumes that sources have unimodal gaussian
distribution\footnote{
Information is proportional to variance if sources have gaussian
distribution \cite{ct91}.}. Sources of information in speech, however, are more complex,
e.g., phones are typically modeled using a multi-state, multi-mixture model.
In this paper, therefore, we extend ANOVA to measure phone information
assuming such multi-state gaussian distribution. We show that sources can be
recombined into useful and harmful sources. Thus, e.g. for speaker
independent automatic recognition of speech (ASR), phone information
represents the useful information and all other sources contribute harmful
information. This relates ANOVA to our work on discriminant features (e.g.,
FIR RASTA filters in temporal domain \cite{sarel97} and discriminant bases
in spectral domain \cite{naren98b}) using linear discriminant analysis
(LDA). We show that improvement in the estimate of phone information could
lead to more robust features.
The paper is organized as follows. In section 2, we give a brief overview of
ANOVA and extend it for multi-state sources. This is followed by a short
description of LDA. In section 3, experimental setup is described and in
section 4, we present results of ANOVA and LDA. We conclude with summary of
our work in section 5.
\section{ANOVA and LDA}
\subsection{ANOVA}
ANOVA assumes that sources have unimodal gaussian distributions. From given
data, sources are segregated as follows. First, mean and variance of each
source is estimated and any observation $X_{ijkl}$ is expressed as a linear
combination of means of sources -- phone (i), speaker (j), channel (k), their
interactions and an error term
\begin{equation}
X_{ijkl}=\bar{X}_{....}+\bar{X}_{i...}+\bar{X}_{.j..}+\bar{X}_{..k.}+\bar{X}_{ij..}+\bar{X}_{.jk.}+\bar{X}_{i.k.}+\bar{X}_{ijk.}+\epsilon _{ijkl},
\end{equation}
where $\bar{X}$ represents the mean of the source or source interaction.
Then the decomposition of total information ($\Sigma _{total}$) follows from
above definition as
\begin{equation}
\Sigma _{total}=\Sigma _{i}+\Sigma _{j}+\Sigma _{k}+\Sigma _{ij}+\Sigma
_{jk}+\Sigma _{ik}+\Sigma _{error}.
\end{equation}
For speech recognition, we are interested only in the phone information. So
we can rewrite the above equation as
\begin{eqnarray}
\Sigma _{total} &=&\Sigma _{p}+\tilde{\Sigma}_{error} \notag \\
&=&\frac{1}{a}\sum_{i=1}^{a}(\bar{X}_{i...}-\bar{X}_{....})^{2}+\frac{1}{abcd
}\sum_{i=1}^{a}\sum_{j=1}^{b}\sum_{k=1}^{c}\sum_{l=1}^{d}(X_{ijkl}-\bar{X}_{i...})^{2}.
\end{eqnarray}
So far, we assumed a single-state model for phone. For multiple-state phone
model, the above equation becomes
\begin{equation}
\Sigma _{total}=\frac{1}{aS}\sum_{i=1}^{a}\sum_{s=1}^{S}(\bar{X}_{i...}^{s}-\bar{X}_{....})^{2}+\frac{1}{abcdS}\sum_{i=1}^{a}\sum_{j=1}^{b} \sum_{k=1}^{c}\sum_{l=1}^{d}\sum_{s=1}^{S}(X_{ijkl}^{s}-\bar{X}_{i...}^{s})^{2},
\end{equation}
where $S$ is the number of states. In the above equation, the first term is
rewritten as
\begin{equation}
\frac{1}{aS}\sum_{i=1}^{a}\sum_{s=1}^{S}(\bar{X}_{i...}^{s}-\bar{X}_{....})^{2}
=\frac{1}{aS}\sum_{i=1}^{a}\sum_{s=1}^{S}(\bar{X}_{i...}^{s}-
\bar{X}_{i...})^{2}+\frac{1}{a}\sum_{i=1}^{a}(\bar{X}_{i...}-\bar{X} _{....})^{2}.
\end{equation}
This shows that the phone information (assuming a multiple state model) is
the sum of the phone information with a single-state model (second term) and
the average position-within-phone information (first term).
\subsection{LDA}
Given that features represent phone classes, LDA \cite{fuku90} is used to
select directions that represent the maximum useful (across-class, $\Sigma
_{a}$) information while suppressing harmful (within-class, $\Sigma _{w}$)
information. We refer to these directions as linear discriminants. For
speech recognition, phone information is the useful information and other
sources represent the harmful information. Therefore, $\Sigma _{a}=\Sigma
_{p}$ and $\Sigma _{w}=\tilde{\Sigma}_{error}$. The discriminant directions ($E$) are obtained from these matrices as
\begin{equation}
E=\mathrm{eig}(\tilde{\Sigma}_{error}^{-1}\ast \Sigma _{p}).
\end{equation}
Discriminant features ($\tilde{X}$) are obtained by projecting original
features ($X$) on discriminant directions ($E$), i.e., $\tilde{X}=XE$.
\section{Experimental Setup}
OGI Stories \cite{cole94} database was analyzed with ANOVA. The database
contains about 3 hours of phonetically hand-labeled conversational speech.
That represents 210 speakers, speaking for about 50 sec each, through
different telephone channels. A set of the 38 most frequently occurring
phonemes from this database was used in this study.
15 critical band spectra, calculated using a 25 ms Hamming window at 100 Hz,
were used as features in the spectral analysis. For the temporal analysis,
each band was analyzed independently. 101-dimensional feature vectors
\cite{sarel97} were used, labeled by the phoneme at their center.
Discriminants were derived independently in spectral and temporal domains
using features described above. Spectral discriminants were referred to as
spectral bases \cite{naren98b} and temporal discriminants were referred to as
FIR RASTA filters \cite{sarel97}. Discriminants were also derived from
sequential optimization in frequency and time domains as follows. First,
spectral discriminant bases were derived. The critical band spectra were
projected on 8 spectral discriminants to form new 8 dimensional feature
vectors. Temporal discriminants were designed separately for temporal
streams of these 8 features. These temporal streams were filtered using
first 3 temporal discriminants from each stream to obtain $8\ast 3=24$
features.
The discriminant features were evaluated on the continuous-digit recognition
task. Digits were modeled using 23 context independent monophone classes
that were trained using 5-state, 3-mixture HMM. The baseline system used 24
features -- 8 cepstral coefficients and their derivatives ($\Delta $s) and
double derivatives ($\Delta \Delta $s) in time. Temporal means were removed
from features in each file (cepstral mean normalization). Spectral LDA
(sLDA) system used 8 spectral discriminant features + 8 $\Delta $s + 8 $\Delta \Delta $s with mean normalization. Temporal LDA (tLDA) system used 45
temporal discriminant features. For system using joint discriminants
(sLDA+tLDA), it was empirically observed that temporal discriminants from
first stream of spectral discriminants were less noisy than those obtained
from subsequent spectral streams. We compared 1) the system with features
derived using identical discriminants (derived from the first stream) for
all 8 streams to 2) the system with features derived using different
temporal discriminants for each stream. Significantly better performance was
obtained using the system with identical discriminants. Results of such a
system are reported in this paper.
\section{Results}
\begin{figure*}[tbh]
\begin{center}
\epsfig{file=3state2.eps,width=12cm,bbllx=63,bblly=315,bburx=553,bbury=486,clip=} \\
\caption{\label{fig1}
Spectral Domain: The diagonal of across-class covariance (denoted by solid line) and
within-class covariance (denoted by dotted-line) for 3 states. Note
the higher phone information in the center state.
}
\epsfig{file=3state1.eps,width=12cm,bbllx=63,bblly=315,bburx=553,bbury=486,clip=} \\
\caption{\label{fig2}
Temporal Domain: The diagonal of across-class covariance (denoted by solid line) and
within-class covariance (denoted by dotted-line) for 3 states. Note the
anti-symmetry in the covariances for states 1 and 3.
}
\epsfig{file=3state3.eps,width=12cm,bbllx=63,bblly=315,bburx=553,bbury=486,clip=} \\
\caption{\label{fig3}
Frequency response of temporal discriminants: Discriminants using
1-state phone model (denoted by dotted-line) and discriminants using
3-state phone model (denoted by solid line).
}
\end{center}
\end{figure*}
The $\Sigma _{p}$ and $\tilde{\Sigma}_{error}$ for 3-state phone model are
shown in figures \ref{fig1} and \ref{fig2} for spectral and temporal domains
respectively. In spectral domain, discriminant information ($\mathrm{trace}(\tilde{\Sigma}_{error}^{-1}\ast \Sigma _{p})$) is highest in the center state. This is
expected as the center state represents the most stable part of the phone.
In temporal domain covariances for states 1 and 3 are asymmetric with
respect to $t=0$ due to the fact that these states are shifted in time with
respect to the center state. Using these covariances, we derived the
spectral and temporal discriminants (refer to section 2). Spectral
discriminants using multi-state phone model were similar to those derived
from 1-state phone model\footnote{Interested
reader can refer to \cite{naren98b} for a more detailed discussion}. They also
performed similar to discriminants derived from 1-state phone model.
Temporal discriminants using multi-state phone model,
however, were different from those derived from 1-state phone model.
They also performed better than discriminants from 1-state phone model.
In both domains, discriminants derived from 5-state
phone model were similar to discriminants derived from 3-state phone model
in the structure and performance.
We further investigated sequential optimization of time-frequency domains
using LDA (refer to section 3). The recognition results using these
discriminants (sLDA+tLDA) were better than baseline features (WER=6.4\%), and
discriminants obtained independently from spectral (sLDA) and temporal
(tLDA) domains. The performance was further improved to
5.5\% using discriminants derived from 3-state phone model. These
improvements are significant at $\alpha =0.05$.
\section{Conclusions}
In this work, we showed that results from ANOVA can be used to derive
discriminant features for ASR. This was done by first grouping sources
into useful and harmful sources. Then LDA was used to obtain discriminant
features. In our previous work on ANOVA and LDA, the phone was assumed to
have unimodal gaussian distribution. The phone model used for recognition,
however, is more complex. In this paper, we improved the estimate of phone
information using multi-state phone model in ANOVA. The discriminant
features using this extension were found to significantly outperform both
conventional features and discriminant features obtained from the 1-state
phone model.
\begin{thebibliography}{9}
\bibitem{sarel97} S. van Vuuren and H. Hermansky: Data-driven design of
RASTA-like filters. Proc. of EUROSPEECH, Greece (1997) 409-412
\bibitem{snh991} Sachin S. Kajarekar, N. Malayath and H. Hermansky:
Analysis of Sources of Variability in Speech. Proc. of EUROSPEECH, Budapest
(1999) 343-346
\bibitem{snh992} Sachin S. Kajarekar, N. Malayath and H. Hermansky:
Analysis of Speaker and Channel Variability in Speech. Proc. of ASRU,
Colorado (1999)
\bibitem{cole94} R. Cole and M. Noel and T. Lander: Telephone speech corpus
development at CSLU. Proc. ICSLP, (1994)
\bibitem{naren98b} H. Hermansky and N. Malayath: Spectral basis functions
from discriminant analysis. Proc. of ICSLP, Sydney, (1998)
\bibitem{fuku90} K. Fukunaga: Introduction to Statistical Pattern Recognition, 2nd ed.,
Academic Press, San Diego (1990)
\bibitem{ct91} Thomas M. Cover and Joy A. Thomas: Elements of Information Theory, John Wiley \& Sons, Inc (1991)
\bibitem{ht96} Robert V. Hogg and Elliot A. Tanis: Statistical Analysis and Inference, 5th ed., Prentice Hall (1997)
\end{thebibliography}
\end{document}