from __future__ import division
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.io.wavfile
import scipy.signal
from IPython.display import Audio
Read a wave file, get its sampling rate and length, and display it to play. The wave data is a single-dimensional array, where `wave[i]` is the amplitude of frame `i`. Each frame represents `1/fs` of a second.
# Load the recording: fs is the sampling rate (frames/second),
# wave is a 1-D array of amplitudes, one entry per frame.
filename = 'sa1.wav'
fs, wave = scipy.io.wavfile.read(filename)
#Note that this particular file has a single channel. Most audio files will have two (stereo) channels.
print 'Data:', wave
print 'Sampling rate:', fs
print 'Audio length:', wave.size/fs, 'seconds'
print 'Lowest amplitude:', min(wave)
print 'Highest amplitude:', max(wave)
# Embed a playable audio widget in the notebook output.
Audio(filename)
Let's plot the wave signal. Zoom in to look at the waves at different points in time. What do you notice?
def plotwave(fs, signal, maxf=None):
    """Visualize (a segment of) a wave signal.

    fs     -- sampling rate, in frames per second
    signal -- 1-D array of per-frame amplitudes
    maxf   -- maximum number of frames to plot (default: the whole signal)
    """
    # The original duplicated the plotting code in two branches that
    # differed only in the slice bound; defaulting maxf removes that.
    if not maxf:
        maxf = signal.size
    frames = scipy.arange(signal.size)  # x-axis, in frame indices
    plt.plot(frames[:maxf], signal[:maxf])
    # Label the x-axis in seconds, one tick every half second.
    plt.xticks(scipy.arange(0, maxf, 0.5*fs), scipy.arange(0, maxf/fs, 0.5))
    plt.show()
plotwave(fs, wave)  # visualize the full recording
Exercise: Complete the definition of `downsample`. Downsampling by a factor of `n` removes all but every nth sample from the original sound, and writes a new file with the same pitch as the original.
def downsample(filename, factor):
    """Lower the sampling rate of a wav file by an integer factor.

    Keeps every factor-th frame and writes the result to
    '<name>-down<factor>.wav' with the correspondingly reduced sampling
    rate, so the pitch of the audio is unchanged.
    """
    newfilename = filename[:-4] + '-down' + str(factor) + '.wav'
    fs, wave = scipy.io.wavfile.read(filename)
    # Integer division: with `from __future__ import division`, fs/factor
    # would be a float, but scipy.io.wavfile.write expects an int rate.
    newfs = fs // factor
    # Slicing with a step keeps every factor-th sample directly,
    # instead of building an explicit index list.
    wave = wave[::factor]
    scipy.io.wavfile.write(newfilename, newfs, wave)
# Produce progressively lower-resolution versions of the recording.
downsample('sa1.wav', 2)
downsample('sa1.wav', 4)
downsample('sa1.wav', 8)
downsample('sa1.wav', 12)
Notice that the general shape looks the same as the original. If you zoom in to the visualization, you will see the lower resolution of the signal.
# Load and visualize the most heavily downsampled version.
fs12, wave12 = scipy.io.wavfile.read('sa1-down12.wav')
plotwave(fs12, wave12)
If we can read a wav file and store its signal, we can also create a signal and write it to a wav file. Let's generate sine waves from note frequencies in the 4th octave.
# Fundamental frequencies of the notes of the 4th octave (plus C5).
note2freq = {'C4':261.6, 'D4':293.7, 'E4':329.6, 'F4':349.2, 'G4':392.0, 'A4':440.0, 'B4':493.9, 'C5':523.3} # in Hz (waves per second)
# basic parameters
duration = .6 # in seconds
fs = 8000 # sampling rate
frames = scipy.arange(duration*fs)  # frame indices 0 .. duration*fs-1
amplitude = 4000
# Pure sine wave per note: amplitude * sin(2*pi*freq*t), with t = frame/fs.
note2signal = {note: amplitude * scipy.sin(2*scipy.pi*frames*note2freq[note]/fs) for note in note2freq}
# 'sp' is a short, near-silent pause a quarter of a note long.
note2signal['sp'] = scipy.ones(int(duration*fs/4))
note = 'E4'
plotwave(fs, note2signal[note], 1000) # visualize first 1000 frames
scipy.io.wavfile.write('sine'+str(note)+'.wav', fs, note2signal[note]) # write to file
Audio('sine'+str(note)+'.wav')
When you know the notes to sing, you can sing most aaanythiing.
# c major scale: concatenate the note signals (with pauses) end to end.
allnotes = scipy.hstack([note2signal[note] for note in 'sp C4 D4 E4 F4 G4 A4 B4 C5 sp'.split()])
scipy.io.wavfile.write('cscale.wav', fs, allnotes)
Audio('cscale.wav')
# twinkle twinkle little star
twinkle = scipy.hstack([note2signal[note] for note in 'sp C4 sp C4 sp G4 sp G4 sp A4 sp A4 sp G4'.split()])
scipy.io.wavfile.write('twinkle.wav', fs, twinkle)
Audio('twinkle.wav')
Listen to and load the following files that contain phonemes (units of sound) extracted from a spoken sentence, all sampled at 16kHz.
# Load each phoneme clip (sampled at 16 kHz per the text above);
# index [1] keeps only the sample array, discarding the sampling rate.
phone2signal = {}
phone2signal['AE'] = scipy.io.wavfile.read('AE.wav')[1]
Audio('AE.wav')
phone2signal['D'] = scipy.io.wavfile.read('D.wav')[1]
Audio('D.wav')
phone2signal['K'] = scipy.io.wavfile.read('K.wav')[1]
Audio('K.wav')
phone2signal['N'] = scipy.io.wavfile.read('N.wav')[1]
Audio('N.wav')
Here are some words comprised of the above phonemes:
# Each word mapped to its pronunciation as a sequence of phoneme symbols.
word_pronunciations = {'dank': ['D', 'AE', 'N', 'K'],
                       'cad': ['K', 'AE', 'D'],
                       'knack': ['N', 'AE', 'K'],
                       'and': ['AE', 'N', 'D']}
Exercise: Following the note sequences examples, write a function that will return an array representing the waveform of a given word.
def pronounce(word):
    """Build the waveform for a word by concatenating its phoneme clips."""
    segments = []
    for phone in word_pronunciations[word]:
        segments.append(phone2signal[phone])
    return scipy.hstack(segments)
# Synthesize a wav file for every word in the pronunciation dictionary.
for word in word_pronunciations:
    scipy.io.wavfile.write(word+'.wav', 16000, pronounce(word))
Listen to the files just created. How do they sound?
Given a sound, we apply Fourier Analysis to decompose it into its frequencies, and visualize them.
Let's start with the simple sine wave for the 'E4' note.
def fftplot(fs, signal):
    """Plot the magnitude spectrum of a signal up to half the sampling rate."""
    n = signal.size
    # Normalized magnitude of each Fourier coefficient.
    magnitudes = abs(scipy.fft(signal) / n)
    # Frequency (in Hz) corresponding to each coefficient.
    freqs = scipy.arange(n) * fs / n
    # The second half mirrors the first, so plot only the first half.
    half = int(n / 2)
    plt.plot(freqs[:half], magnitudes[:half])
    plt.show()
fftplot(8000, note2signal['E4'])  # E4 is a pure sine at 329.6 Hz, so expect a single peak
'E4' as a single simple note had only one frequency for the whole duration. Most sounds have different frequencies at different points of time. We'll visualize these as spectrograms, with time on the x-axis, frequency on the y-axis, and the intensity of a given frequency at the given time step noted by shading.
def spectrogram(wavfile):
    """Display a gray-scale spectrogram of the given wav file."""
    rate, samples = scipy.io.wavfile.read(wavfile)
    # 5 ms analysis windows overlapping by 2.5 ms.
    plt.specgram(samples,
                 NFFT=int(rate * 0.005),
                 Fs=rate,
                 cmap=plt.cm.gray_r,
                 pad_to=256,
                 noverlap=int(rate * 0.0025))
    plt.show()
spectrogram('sa1.wav')      # recorded speech
spectrogram('twinkle.wav')  # synthesized sine-wave melody
`sa1.wav` is from the TIMIT corpus, which contains recordings of several speakers saying short sentences. (Part of this corpus is available from NLTK.) Locations of every phoneme segment are manually annotated.
I have extracted the speech clips corresponding to each vowel sound in every recording, and used Fourier analysis to locate the first two formants for each clip. These formants are stored in `male.formants.csv` and `female.formants.csv`.
Here's some code that plots the means of these formants and the average for each sex. What do you notice?
from collections import defaultdict
def plotformants(filename, color):
    """Scatter-plot per-vowel mean F1/F2 formants read from a csv file.

    Each usable csv line has three fields: vowel label, F1, F2. The plot
    shows F2 on the x-axis and F1 on the y-axis with both axes reversed,
    and includes an 'AVG' point averaging over all vowels.
    """
    # `with` closes the file promptly; the original leaked the handle.
    with open(filename) as f:
        rows = [line.split(',') for line in f]
    data = [(float(row[1]), float(row[2]), row[0]) for row in rows if len(row) == 3]
    # Group the formant measurements by vowel.
    f1all = defaultdict(list)
    f2all = defaultdict(list)
    for f1, f2, vowel in data:
        f1all[vowel].append(f1)
        f2all[vowel].append(f2)
    # Per-vowel means, plus the grand mean across all vowels.
    means = {}
    for vowel in f1all:
        means[vowel] = (scipy.average(f1all[vowel]), scipy.average(f2all[vowel]))
    means['AVG'] = (scipy.average([val[0] for val in means.values()]),
                    scipy.average([val[1] for val in means.values()]))
    f1, f2 = zip(*means.values())  # unzip into parallel F1, F2 sequences
    plt.scatter(f2, f1, s=30, c=color)
    for vowel in means:
        plt.annotate(vowel, (means[vowel][1]+0.01, means[vowel][0]+0.01))
    # Reversed axis limits flip both axes (conventional vowel-chart layout).
    plt.axis([2500, 800, 900, 300])
# Overlay female (red) and male (blue) mean formants on the same chart.
plotformants('female.formants.csv', 'red')
plotformants('male.formants.csv', 'blue')
plt.show()