\documentclass[10pt]{article}
\usepackage{graphicx}
\begin{document}
\title
{
\Large\bfseries
Speaker Identification Using Autoregressive Hidden Markov
Models and Adaptive Vector Quantization
}
\author
{
Eugeny E. Bovbel
\and Igor E. Kheidorov
\and Michael E. Kotlyar
}
\date
{
\small
Dept. of Radiophysics, Belarussian State University\\
F. Scoriny, 4, 220050, Minsk, Belarus\\
tel, fax. (0172) 770-890\\
e-mail: ikheidorov@poboxes.com\\
20 May 2000
}
\maketitle
\begin{abstract}
\small
Wide-frequency spectral analysis, autoregressive hidden Markov models (ARHMM)
and self-organizing neural networks (SOM) have been used for high accuracy
speaker feature modelling. An initial ARHMM parameter estimation method based
on the Kalman filter is proposed. The five-keyword speaker identification system
has been built and tested. The experiments show that this approach provides
high accuracy of speaker identification even if the same words are pronounced
by different speakers.
\end{abstract}
\section {Introduction}
By its nature speech signal has two aspects.
Firstly, it is highly dependent on physical parameters of vocal tract.
Secondly, the speech is produced under neural control
of the human, so that it is affected by the person-specific characteristics
such as accent, pronunciation, speed, timbre etc. It gives us the possibility
to use these features for speaker identification.
We model the person-specific characteristics taking into consideration
the speech signal parameters interrelation between close frames.
This approach provides us with the useful information about pronunciation mode.
From this viewpoint, statistical methods based on autoregressive HMMs
are very attractive, because the standard HMM does not take into account
the interrelation of temporally close observations, and thus loses valuable
information about the acoustical structure of the phoneme.
\section {Autoregressive hidden Markov models}
The speech utterance can be modelled by the sequence of the
discrete stationary states with immediate transitions between them.
Let $\overline{O}=(O_{1},O_{2},\ldots,O_{N})$ be the observation vector sequence
for the given utterance, and $\overline{q}=(q_{1},q_{2},\ldots,q_{N})$ be the
appropriate HMM states sequence. Here $O_{n}=(x_{n,1},x_{n,2},\ldots,x_{n,K})$
is the $n$th $K$-dimensional observation vector.
For each state $q$ it is necessary to define distribution of
observation vectors $b_{q}(O)$.
We assume that the vector sequence $\overline{O}$ components
conform to the $P$-order autoregressive model:
\begin{equation} \label{1}
x_{n,k}=-\sum_{i=1}^{P}a_{k,i}x_{n-i,k}+\epsilon_{k}
\end{equation}
where $\epsilon_{k}$ are independent zero-mean Gaussian random values
with variance $\sigma^{2}$, and $a_{k,i}$ are the autoregression coefficients
(linear prediction coefficients) for the $k$th component of the observation vector.
With this assumption for each state $q$ we can estimate observation vector
components distribution as follows:
\begin{equation} \label{2}
b_{q}(x_{n,k}|x_{n-1,k},x_{n-2,k},\ldots,x_{n-P,k})=\frac{1}{\sqrt{2\pi\sigma_{q}^{2}}}\exp\left(-\frac{1}{2\sigma_{q}^{2}}\left(x_{n,k}+\sum_{i=1}^{P}a_{k,i,q}x_{n-i,k}\right)^{2}\right)
\end{equation}
So, the autoregressive hidden Markov model is a model of a twice stochastic
process: the model state sequence is first-order Markov process,
and the observation vectors sequence is random process too and
is modelled by $P$-order autoregression.
\section {Initial estimation of ARHMM parameters}
We have used Kalman filter to estimate initial autoregression parameters
for ARHMM. Consider linear system. States of the system in moment $t$ and in
moment $t-1$ are constrained by the system equation:
\begin{equation} \label{system_equation}
q_{t}=Fq_{t-1}+N_{t}
\end{equation}
where $F$ is a linear operator and $N_{t}$ is the system noise.
Let $q_{t}$ be the linear system state at time $t$. At every time $t$ we
measure the system output $y_{t}$, which is a function of the system state:
\begin{equation} \label{measuring_equation}
y_{t}=Hq_{t}+G_{t}
\end{equation}
where $H$ is the measurement operator and $G_{t}$ is the measurement noise.
If we suppose $q_{t}=(a_{1},a_{2},\ldots,a_{P})$ and $y_{t}=x_{t,k}$,
we can rewrite Eqs.~(\ref{system_equation}) and (\ref{measuring_equation}) as follows:
\begin{equation}
q_{t}=q_{t-1}+N_{t}
\end{equation}
\begin{equation}
y_{t}=H_{t}q_{t}+G_{t}
\end{equation}
\begin{equation}
H_{t}=-\mathrm{diag}(y_{t-1},y_{t-2},\ldots,y_{t-P})
\end{equation}
Using Kalman algorithm, if $y_{t}$ sequence is given, we can estimate system
parameters $q$. The detailed description of the Kalman filter can be found in~\cite{b6}.
\section {Experiment}
Five Belarusian words spoken by 5 speakers have been sampled at a 44~kHz rate
in different environmental conditions during 3 months.
130 samples per word per speaker have been taken, 3250 samples totally.
Samples have been separated in two nonintersecting sets: the training and
the testing one.
Mel-cepstrum and delta-cepstrum have been used as observation vectors.
Separated SOM-codebooks and mixed discrete-continuous ARHMMs have been built
for cepstrums and delta-cepstrums. Probabilities given by corresponding
cepstr- and delta-cepstr ARHMMS were multiplied. Two codebooks (for cepstrums
and delta-cepstrums) were trained on all vectors made from training set.
Then, ARHMM pairs have been trained on the training set --- one ARHMM pair per
word per speaker, 25 model pairs totally.
The first series of experiments has been carried out to determine the optimal
codebook size (see Fig.~1).
\begin{center}
\scalebox{0.8}{\includegraphics{fig1.eps}}

Fig.~1
\end{center}
Training set consisted of 5 samples per word per
speaker, remaining samples were used for testing. As we can notice, optimal
codebook size is about 139, which is rather larger than is usual in
word-recognition tasks.
The second series of experiments was designed to estimate our speaker recognition
system's overall performance (see Fig.~2).
\begin{center}
\scalebox{0.8}{\includegraphics{fig2.eps}}

Fig.~2
\end{center}
Training set consisted of 30 samples
per word per speaker, remaining samples were used for testing. As we can see,
overall recognition performance, when every speaker pronounces the sequence of
the same 5 words, is close to 100\%.
\begin{thebibliography}{b1}
\bibitem{b1}
D.A. Reynolds, Speaker identification and verification using Gaussian
mixture speaker models, Speech Communication, no.~17(1--2), pp.~91--108, 1995.
\bibitem{b2}
H.A. Bourlard and N. Morgan, Connectionist Speech Recognition: A Hybrid
Approach, Kluwer Academic Publishers, Boston, MA, 1994.
\bibitem{b3}
B.H. Juang, L.R. Rabiner, Mixture autoregressive hidden Markov
models for speech signals, IEEE Trans. ASSP-33, no.~6, pp.~1404--1412, 1985.
\bibitem{b4}
E.I.Bovbel, I.E.Kheidorov, P.P.Tkachova, The analysis of speaker
individual features based on autoregressive hidden Markov models,
Proc. of EUROSPEECH'99, vol.3, pp.1191-1194, September 5-9, 1999,
Budapest, Hungary.
\bibitem{b5}
T. Kohonen, The self-organizing map, Proceedings of the IEEE, vol.~78, 1990, pp.~1464--1480.
\bibitem{b6}
Youji Iiguni, A real-time learning algorithm for a multilayered neural network based on the extended Kalman filter, IEEE Transactions on Signal Processing, vol.40, April 1992, pp.959-966.
\end{thebibliography}
\end{document}