function y=flexdct(x, type)
% FLEXDCT           Compute DCT I, II, III and IV
%
%      Synopsys:
%
%            Y=FLEXDCT(X, TYPE)
%
%      Parameters:
%
%           X    = Input data
%           TYPE = DCT type
%
%      Description:
%
%           Compute the DCT of input vector X.  If X is a matrix,
%           it proceeds columnwise.  TYPE select the DCT type it
%           can assume values 'i', 'ii', 'iii', 'iv' or 1, 2, 3, 4.
%
%           NOTE: On my PC this function is faster than direct
%           computation for sizes >= 512.  If you need to compute
%           smaller DCTs you can use this function as follows
%
%                 mtx = flexdct(eye(N), TYPE);
%                   y = mtx*x;
%
%      Defaults:
%
%           NONE
%
%      See also: 
%
%           NONE
%

%
% Tested on 23/9/2005 by comparing flexdct(eye(8), type) with the
% DCT matrices. 
%

%%
%% Default handling
%%

%
% Call parsing
%

if (nargin < 2)
  error('Usage: OUTPUT=FLEXDCT(INPUT, TYPE)')
end

if (ischar(type))
  switch lower(type)
   case 'i'
    type=1;
   case 'ii'
    type=2;
   case 'iii'
    type=3;
   case 'iv'
    type=4;
   otherwise 
    error(['Unrecognized type ' type]);
  end
elseif (~isreal(type))
  error('TYPE must be a number or a string')
end

if (type ~= fix(type) || type < 1 || type > 4)
  error('TYPE must be an integer between 1 and 4');
end


%%
%% True code
%%



%
% The definition of the four DCT types are (m=0..n-1)
%
% DCT I
%   y(m) = (1/2)(x(0)+(-1)^m) + 
%             \sum_{k=1}^{n-1} x(k) cos((pi/(N-1))*m*k)
%
% DCT II
%   y(m) = \sum_{k=0}^{n-1} x(k) cos((pi/N)*m*(k+1/2))
%
% DCT III
%   y(m) = x(0)/2 + \sum_{k=1}^{n-1} x(k) cos((pi/N)*(m+1/2)*k)
%
% DCT IV
%   y(m) = \sum_{k=0}^{n-1} x(k) cos((pi/N)*(m+1/2)*(k+1/2))
%
%
% The main part of the algorithm will be the computation of sums
% like
%
%      \sum_{k=0}^{N-1} a(k) cos((pi/N)*(m+alpha)*(k+beta))   (+)
%
% The trick is to observe that (+) is the real part of 
%
%   \sum_{k=0}^{N-1} a(k) exp(-j*(pi/N)*(m+alpha)*(k+beta)) 
%     = \sum_{k=0}^{N-1} a(k) exp(-j*(2*pi/2*N)*(m+alpha)*(k+beta)) 
%     = \sum_{k=0}^{N-1} a(k) W_{2N}^(m*k + m*beta + k*alpha + alpha*beta)
%     = W_2N^{m*beta}
%           \sum_{k=0}^{N-1} [a(k) W_2N^(k*alpha+alpha*beta)] W_{2N}^(m*k)
%     = W_2N^{m*beta}
%           \sum_{k=0}^{2*N-1} b(k) W_{2N}^(m*k)
%
% where
%
%            [a(k) W_2N^(k*alpha+alpha*beta)]      if  k=0..N-1
%
%   b(k)= 
%            0                                     if  k=N..2*N-1
%
%
% Summarizing, the algorithm to compute (+) is
%
%  1. Create signal b(k)
%
%  2. Compute the FFT of b(k)
%
%  3. Multiply the result by W_2N^{m*beta}
%
%  4. Finally, take the real part of the result
%
% In the case of DCT-II and DCT-IV the algorithms above compute the
% desired DCT; in the case of DCT-III one must replace x(0) with 0
% and add x(0)/2 to the final result
%
% In order to save still some more computations when N is even, one
% could think of computing the FFT of b(k) by means of a
% Cooley-Tukey FFT.  The first stage of the CT-FFT computes the
% "butterflies" 
%
%         x_+(n) =  x(n) + x(n+N)
%
%         x_-(n) = (x(n) - x(n+N))*W_{2N}^{n}
%
%
% Succesively the CT-FFT computes the FFT of x_+ and x_- in order
% to obtain the odd and even values of X(k).
%
% 
% Please note that although this algorithm is more efficient than
% direct computation, it is not the most efficient one.  
%
%

%
% Take care of the dimensions of x
%
[nr,nc]=size(x);

if (nr==1 | nc==1)
  %
  % If x is a vector, force it to column
  %
  x=x(:);
  N = max(nr, nc);
else
  %
  % If x is a matrix, operate columnwise
  %
  N = nr;
end

if (N<2)
  error('Cannot take the DCT of a single sample');
end


persistent old_type old_size 
persistent pre_proc pre_proc2 post_proc
persistent alpha beta

if (isempty(old_type) || old_type ~= type || old_size ~= N)
  %
  % Find the values of alpha and beta corresponding to the desired
  % type. 
  %
  switch (type)
   case 1
    error('DCT-I not implemented yet (sorry)')
    
   case 2
    beta  = 1/2;
    alpha = 0;
  
   case 3
    beta  = 0;    
    alpha = 1/2;  
   
   case 4
    beta  = 1/2;
    alpha = 1/2;
  end
  
  %
  % 2N-th root of unit
  %
  W2N = exp(-j*2*pi/(2*N));
  
  idx = alpha*(beta+(0:N-1));  
  u   = W2N .^ idx;
  v   = W2N .^ (0:(N-1));
  pre_proc  = sparse(1:N, 1:N, u);
  pre_proc2 = sparse(1:N, 1:N, u .* v);

  idx = beta*(0:N-1);
  post_proc = sparse(1:N, 1:N, W2N .^ idx);
  
  old_type = type;
  old_size = N;
end

%
% Special attention is needed for DCT-III
%
if (type == 3)
  x0 = x(1,:);     %% Save it for later
  x(1,:) = 0;
end



%
% Do preprocessing if needed
%
if (alpha ~= 0)
  x = pre_proc*x;
end

x = fft(x, 2*N);           % Compute the FFT
x = x(1:N,:);              % We need only the first N values

%
% Do postprocessing if needed
%
if (beta ~= 0)
  x = post_proc*x;
end

%
% Extract the real part.  This concludes DCT-II and DCT-IV
%
y = real(x);

%
% If DCT-III is required, add x(0)/2
%
if (type == 3)
  y = y+ones(N,1)*x0/2;
end

%
% Finally, force the output to the same dimensions of the input
%
y = reshape(y, nr, nc);


