Package lib :: Module grabdata2
[frames] | no frames]

Source Code for Module lib.grabdata2

  1  #!/usr/bin/python 
  2   
  3  """This module computes the uncompressed feature vectors for 
  4  music rhythm determination. 
  5   
  6  This can be run as a script for debugging/exploratory purposes. 
  7  When run as a script, it produces plots of various stages of the 
  8  signal processing sequence. 
  9   
 10  It is also imported as a module, in which case the plotting is off. 
 11  The normal call is to make_vector(). 
 12  """ 
 13   
 14   
 15  import os 
 16  import math 
 17  import numpy    # Libraries for vector and matrix operations. 
 18  import pylab 
 19  import cPickle          # Serializing data. 
 20  import subprocess 
 21  from gmisclib import wavio 
 22  from gmisclib import cache              # Managing an on-disk cache. 
 23   
 24  from gpk_voicing import voice_misc as VM 
 25   
 26  # You could probably substitute any other feature vector module 
 27  # in gpk_voicing for the following: 
 28  from gpk_voicing import fv201105 as FV          # Compute spectrogram. 
 29   
 30  CACHEROOT = '/tmp/artdist'      #: Where to cache intermediate results. 
 31  CWT = 1.0 
 32  MEDRATE = 1.0 
 33   
34 -def s_preprocess(f, Dt=None, plotit=False):
35 """First stage. 36 This does the signal processing to (essentially) compute a spectrogram 37 of the music. The spectrogram is a two-dimensional representation of 38 the sound: time is one axis, and the other axis is frequency. 39 40 It uses a "perceptial spectrum" which is more representative of what 41 humans actually hear than the raw spectrogram. Essentially, it 42 matches the spectral bandwidth to the ear: it uses a filterbank where 43 each filter is 0.7 erb wide. Then, it half-wave rectified the signal 44 and takes the cube root, in order to match the amplitude compression 45 in the ear. 46 47 @param f: filename for the input audio file. 48 @type f: str 49 @param Dt: The desired sampling interval of the output feature vectors. (E.g. 10ms.) 50 @type Dt: float 51 @rtype: C{numpy.ndarray} 2-dimensional. 52 @return: A sequence of feature vectors that span the specified region. 53 This is a 2-dimensional spectrogram. 54 """ 55 COL = 0 #: Which column (channel) of the audio file to use? 56 fwav = '/tmp/Mrhythm.wav' #: Location for a temporary file 57 # Note that if you were doing this in parallel, the threads couldn't share the same temp file. 58 try: 59 os.remove(fwav) 60 except OSError: 61 pass 62 # First step is to convert from MP3, stereo to WAV, mono. 63 subprocess.check_call(['sox', f, fwav, 'channels', '1']) 64 x = wavio.read(fwav) #: gpkimgclass.gpk_img representation of track in memory. 65 # Ci contains information to store a copy of the feature vector on disk, so that 66 # we don't have to recompute. 67 ci = cache.cache_info(root=CACHEROOT, fname=f, info=(COL,)) 68 print 'Dt=', Dt 69 # Next comes the actual computation of the spectrogram: 70 o, descr, Dt, t0 = FV.feature_vec(x.column(COL), x.dt(), Dt, fdif2=-0.3, cache_info=ci) 71 assert len(o)==len(descr) 72 print 'Frequency bands:', [q.get('fc', 'all') for q in descr] 73 rv = numpy.transpose(numpy.array(o, numpy.float)) 74 if plotit: 75 pylab.imshow(rv) 76 pylab.show() 77 return rv
78 79 80 81 82
83 -def plot_speech(smd):
84 """Plot the acoustic data. 85 """ 86 pylab.imshow(smd.transpose(), aspect=3) 87 pylab.title("Rhythmic patterns")
88 89
90 -class faxis(object):
91 """This is a logarithmically spaced frequency axis. 92 """
93 - def __init__(self, f0, r, fmx):
94 self.f0 = f0 95 self.fmx = fmx 96 self.r = r 97 self.n = int(math.floor(math.log(fmx/f0)/r))
98
99 - def vals(self, i=None):
100 """@param i: ask for the frequency at a certain index. 101 Or, if C{i=None} (the default), yield an array of 102 all the frequencies. 103 @type i: int or None. 104 """ 105 if i == None: 106 i = numpy.arange(self.n) 107 return self.f0 * numpy.exp(i*self.r)
108 109 110
111 -def spectral(sd, Dt, plotaudio=False):
112 """Compute beat rates from a spectrogram. 113 @param sd: his starts from a spectrogram with one time axis and one frequency axis, 114 @type sd: numpy.ndarray 2-dimensional array of floats. 115 @return: It computes an array where the time axis is converted to a beat-rate axis. 116 So, the value of each element tells you how strong a particular beat 117 is, at a particular pitch. An element might reveal the strength of 118 a 60BPM beat in the high pitch range near 3 kHz (e.g. snare drum). 119 So, the frequency axis separates the track by instrument, and 120 the beats-per-minute axis lets you look at the rhythm of each instrument. 121 (Of course, that's a somewhat idealized view...) 122 @rtype: numpy.ndarray, 2-dimensional array of floats. 123 """ 124 EDGE = sd.shape[0]*Dt/3.0 125 d = VM.dataprep2_flat_generic(sd, Dt, edgewidth=EDGE, pad=sd.shape[0]*Dt, isreal=True) 126 s = numpy.absolute(numpy.fft.rfft(d, axis=0)**2) #: FFT the spectrogram along the time axis. 127 f = VM.real_fft_freq(s.shape[0], d=Dt) #: Get axis of the beat rate for the FFT. 128 df = f[1]-f[0] 129 print "df=", df, "length=", Dt*d.shape[0], Dt*sd.shape[0] 130 rv = [] 131 fa = faxis(1./4.0, 0.03, 10.0) #: Get an axis of the beat rates we want in the uncompressed feature vector. 132 stf = math.exp(fa.r/2.0) 133 # Here, we bin the data according to the fa axis. 134 # The data comes on a linear beat-rate axis (from the FFT) and we are binning 135 # it onto a log axis. Why? Partially because human perception of 136 # changes to a beat are roughly logarithmic: a 10% change in the length of 137 # a 5-second long phrase is (roughly!) as important as a 10% change in 138 # a one-second long metronome tick. 139 # More importantly, we will later want to look for various beat ratios, 140 # like 2:1 and 3:2, and that's very convenient when you use a log scale. 141 for tau in fa.vals(): 142 im = int(round(1.0/(tau*df*stf))) 143 ip = int(round(1.0/(tau*df/stf))) 144 rv.append(numpy.mean(s[im:ip+1,:], axis=0)) 145 rv = numpy.array(rv) 146 if plotaudio: 147 plot_speech(rv) 148 pylab.show() 149 return (rv, fa)
150 151
152 -def stage2(s, fa, plotit=False):
153 r = numpy.argmax(s, axis=0) 154 rate = fa.vals(r) 155 rweight = s.take(r) 156 numpy.divide(rweight, rweight.mean(), rweight) 157 if plotit: 158 pylab.subplot(311) 159 pylab.plot(rate, 'g') 160 pylab.subplot(312) 161 pylab.plot(rweight, 'y') 162 c = [] 163 k = s.shape[0] // 2 164 cwt = 1 + numpy.arange(s.shape[0]-k) 165 for i in range(s.shape[1]): 166 si = s[:,i] - numpy.median(s[:,i]) 167 c.append( numpy.correlate(si, si, mode="same")[k:] * cwt ) 168 if plotit: 169 pylab.subplot(313) 170 pylab.imshow(numpy.array(c), aspect=1) 171 pylab.show() 172 cc = numpy.array(c) 173 numpy.divide(cc, numpy.mean(cc), cc) 174 return (numpy.log(rate/MEDRATE)*rweight, cc*CWT)
175 176
177 -def make_vector(f1, name, plotaudio=False):
178 """The basic idea here is that we should not expect a song to have a consistent 179 rhythmic pattern thoughout. So, we divide the track into blocks, and 180 nside each block, we assume a consistent pattern. (Obviously, this is not 181 perfect: we chop it into blocks without regard for the song. A better scheme 182 mght be to try to find boundaries in the song where the rhythm changes.) 183 If the rhythm changes within a block, you'll get a blurred rhythmic pattern. 184 185 At the end, we report the average rhythm pattern, over all the blocks. 186 @param f1: name of the audio file 187 @type f1: str 188 @param name: the track's label. (Note: this is just passed through to the output.) 189 @type name: normally L{str} 190 @param plotaudio: display plots (true) or run silently (false). 191 This particular routine plots the averaged beat rate as a function of frequency, 192 and also the averaged feature vector. 193 @type plotaudio: boolean 194 @rtype: A L{pickle} of C{(name, feature vector)} 195 @return: It returns an opaque blob (a L{pickle}) that you can write to a file and easily read back. 196 The pickle contains the track's label, along with the uncompressed feature vector. 197 This feature vector contains the beat rate as a function of frequency, followed by 198 the part of the feature vector that describes the averaged rhythmic pattern. 199 """ 200 BLOCK_DUR = 32.0 #: The size of a block (in seconds) 201 Dt = 0.01 #: The time interval between frames (feature vectors) 202 sd = s_preprocess(f1, Dt=Dt, plotit=plotaudio) #: sequence of uncompressed feature vectors. 203 duration = sd.shape[0]*Dt #: Total duration of the track 204 nblocks = int(math.ceil(duration/(0.5*BLOCK_DUR))) 205 rsum = None #: Thi accumulates the average beat rate, block by block 206 csum = None 207 for i in range(nblocks): 208 if nblocks > 1: 209 s = i * (duration-BLOCK_DUR)/(nblocks-1) #: where does the block start (seconds)? 210 e = s + BLOCK_DUR #: where does the block end (seconds)? 211 else: 212 s = 0.0 213 e = duration 214 assert e is None or s is None or e > s 215 iis = int(round(s/Dt)) if s is not None else 0 #: Block start (in frames). 216 iie = int(round(e/Dt)) if e is not None else duration #: Block end (in frames). 217 assert iie > iis 218 print 's, e=', s, iis, e, iie 219 sdc = sd[iis:iie,:] #: This is the spectrogram for the block. 220 s, fa = spectral(sdc, Dt, plotaudio=plotaudio) 221 rate, c = stage2(s, fa) 222 if rsum is None: 223 rsum = rate 224 else: 225 numpy.add(rsum, rate, rsum) 226 if csum is None: 227 csum = c 228 else: 229 numpy.add(csum, c, csum) 230 print 'nblocks=', nblocks, csum.shape, rsum.shape 231 numpy.divide(csum, nblocks, csum) 232 numpy.divide(rsum, nblocks, rsum) 233 if plotaudio: 234 pylab.subplot(211) 235 pylab.plot(rsum, 'g') 236 pylab.subplot(212) 237 pylab.imshow(numpy.array(csum), aspect=1) 238 pylab.show() 239 return cPickle.dumps((name, numpy.concatenate((rate, c.reshape((-1,))))))
240 241 242 if __name__ == '__main__': 243 import sys 244 if len(sys.argv) <= 1: 245 raise SystemExit 246 # xyz = 'Z' if len(sys.argv)<=3 else sys.argv[3] 247 # apair(sys.argv[1], sys.argv[2], plotaudio=True, plotart=True, channel=xyz) 248 # s, fa = spectral(sys.argv[1], plotaudio=True) 249 # rate, rweight, c = stage2(s, fa, plotit=True) 250 make_vector(sys.argv[1], sys.argv[1], plotaudio=True) 251