hachures-tourneurs
clone your own copy | download snapshot

Snapshots | iceberg

Inside this repository

ecantorix.pl
text/x-perl

Download raw (27.1 KB)

#!/usr/bin/perl
#
#   eCantorix - singing speech synthesis using eSpeak
#   Copyright (C) 2012  Rudolf Polzer
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.

use FindBin;
use lib $FindBin::Bin;

use strict;
use warnings;
use MIDI;
use URI::Escape;
use Math::FFT;
use Cwd;
use Getopt::Long;
use Digest::SHA;
use eCantorix::Util;

# from wavegen.cpp
# set from y = pow(2,x) * 128,  x=-1 to 1
our @pitch_adjust_tab = (
    64, 65, 66, 67, 68, 69, 70, 71,
    72, 73, 74, 75, 76, 77, 78, 79,
    80, 81, 82, 83, 84, 86, 87, 88,
    89, 91, 92, 93, 94, 96, 97, 98,
   100,101,103,104,105,107,108,110,
   111,113,115,116,118,119,121,123,
   124,126,128,130,132,133,135,137,
   139,141,143,145,147,149,151,153,
   155,158,160,162,164,167,169,171,
   174,176,179,181,184,186,189,191,
   194,197,199,202,205,208,211,214,
   217,220,223,226,229,232,236,239,
   242,246,249,252, 254,255
);

# these variables can be overridden using the same syntax from a control file
# e.g. to be able to use a different speaker voice

# voice parameters
our $ESPEAK_VOICE = "default";
our $ESPEAK_VOICE_PATH;
our $ESPEAK_TRANSPOSE = -24;
our $ESPEAK_USE_PITCH_ADJUST_TAB = 0;

# filtering
our $SOX_RATE = 22050;
our $SOX_PREEFFECTS;
our $SOX_PREEFFECTS_CHANNELS = 1;
our $SOX_AFTEREFFECTS;
our $SOX_AFTEREFFECTS_CHANNELS = 1;

# paths
our $ESPEAK_CACHE = ".";
our $ESPEAK_TEMPDIR = undef; # use $ESPEAK_CACHE/tmp by default
our $ESPEAK_CACHE_PREFIX = "";

# input syllable editing (perl expression or sub operating on $_)
our $EDIT_SYLLABLES;

# hint: $EDIT_SYLLABLES = sub { s/^\s*\[\[\s*(.*)\s*\]\]\s*$/$1/s or s/^\s*(.*?)\s*$/[[$1]]/s; };
# changes the parsing to [[words]] and phonetics, instead of the default (words and [[phonetics]])

# output
our $OUTPUT_FORMAT = 'wav';
our $OUTPUT_FILE = '-';

# if $OUTPUT_FORMAT is 'mid':
our $OUTPUT_MID_PREFIX = 'vocals:';

# tools (usually need no changes here)
our $ESPEAK = 'espeak -v "$VOICE" ${VOICE_PATH:+--path="$VOICE_PATH"} -z -p "$PITCH" -s "$SPEED" -a "$VELOCITY" -w "$OUT" -m "<prosody range=\"0\"> $SYLLABLE </prosody>"';
our $SOX_PROCESS_IN_TO_S16LE = 'sox "$IN" -t raw -r "$RATE" -e signed -b 16 -c $PREEFFECTS_CHANNELS - remix - $PREEFFECTS silence 1 1s 0 reverse silence 1 1s 0 reverse';
our $SOX_PROCESS_TEMPO_PITCHBEND_S16LE_TO_OUT = 'sox -t raw -r "$RATE" -e signed -b 16 -c $PREEFFECTS_CHANNELS - "$OUT" tempo -s "$TEMPO" $PITCHBEND $AFTEREFFECTS';
our $SOX_PROCESS_TEMPO_PITCHBEND_S16LE_TO_OUT_USES_PITCH = 0;
our $SOX_TEMPO_MIN = 0.1;
our $SOX_TEMPO_MAX = 100;

# espeak tool options (normally don't touch this)
our $ESPEAK_ATTEMPTS = 8;
our $ESPEAK_PITCH_MIN = 0;
our $ESPEAK_PITCH_START = 50;
our $ESPEAK_PITCH_MAX = 99;
our $ESPEAK_SPEED_MIN = 80;
our $ESPEAK_SPEED_START = 175;
our $ESPEAK_SPEED_MAX = 450;
our $ESPEAK_VELOCITY_MIN = 0;
our $ESPEAK_VELOCITY_NORMAL = 100;
our $ESPEAK_VELOCITY_MAX = 200;

# pitch caching (normally don't touch this)
our $ESPEAK_PITCH_CACHE = 1;
our $ESPEAK_PITCH_CACHE_SPEEDS = 5;
our $ESPEAK_PITCH_FACTOR; # this can be a sub that calculates the pitch change

# pitch calculating (normally don't touch this)
our $ANALYZE_MINFREQ = 20;
our $ANALYZE_MAXFREQ = 500;
our $ANALYZE_BIAS = 1.4;
our $ANALYZE_RANGE1 = 2;
our $ANALYZE_RANGE2 = 16;
our $ANALYZE_ENV = 3;

# ASS output
our $ASS_PRETIME = 1.5;
our $ASS_POSTTIME = 0.5;
our $ASS_LENGTHFACTOR = 1.0;
# end of customizable variables

# some options make sense overriding from the command line
Getopt::Long::Configure("gnu_getopt", "auto_help", "auto_version");
GetOptions(
	'control-file|C=s' => sub { do $_[1]; },
	'voice|v=s' => \$ESPEAK_VOICE,
	'transpose|t=s' => sub { $ESPEAK_TRANSPOSE += $_[1]; },
	'rate|r=s' => \$SOX_RATE,
	'pre-effects|P=s' => \$SOX_PREEFFECTS,
	'after-effects|A=s' => \$SOX_AFTEREFFECTS,
	'cache|c=s' => \$ESPEAK_CACHE,
	'output-format|O=s' => \$OUTPUT_FORMAT,
	'output|o=s' => \$OUTPUT_FILE,
	'output-mid-prefix=s' => \$OUTPUT_MID_PREFIX,
	'edit-syllables=s' => \$EDIT_SYLLABLES,
	'dry-run|n' => sub { $OUTPUT_FORMAT = undef; },
);

my ($filename) = @ARGV;

$EDIT_SYLLABLES = eval "sub { $EDIT_SYLLABLES; }"
	if defined $EDIT_SYLLABLES and not ref $EDIT_SYLLABLES;

if(!defined $ESPEAK_PITCH_FACTOR and $ESPEAK_USE_PITCH_ADJUST_TAB)
{
	# works only for voices with zero range
	$ESPEAK_PITCH_FACTOR = sub { return $pitch_adjust_tab[$_[0]]; };
}

$ESPEAK_CACHE = Cwd::abs_path($ESPEAK_CACHE);
$ESPEAK_TEMPDIR //= "$ESPEAK_CACHE/tmp";
mkdir "$ESPEAK_TEMPDIR";

my $statusline_active = 0;
sub statusline(@)
{
	if($statusline_active)
	{
		print STDERR "\e[A\e[K";
	}
	print STDERR @_, "\n";
	$statusline_active = 1;
}
sub statusout(@)
{
	print STDERR @_;
	$statusline_active = 0;
}

my $opus = MIDI::Opus->new({from_file => $filename});

sub tick2lmms($)
{
	return int(48 / $opus->ticks() * $_[0] + 0.5);
}
my %out = (
	mmp => {
		header => sub
		{
			my ($self, $totalticks, $totaltime, $tempi) = @_;
			open $self->{fh}, ($OUTPUT_FILE eq "-" ? ">-" : (">", $OUTPUT_FILE))
				or die "open $OUTPUT_FILE: $!";

			print { $self->{fh} } <<EOF;
<?xml version="1.0"?>
<!DOCTYPE multimedia-project>
<multimedia-project version="1.0" creator="Linux MultiMedia Studio (LMMS)" creatorversion="0.4.13" type="song">
	<head timesig_numerator="4" mastervol="100" timesig_denominator="4" masterpitch="0">
		<bpm value="120" id="3481048"/>
	</head>
	<song>
		<trackcontainer width="600" x="5" y="5" maximized="0" height="300" visible="1" type="song" minimized="0">
EOF
			if(@$tempi)
			{
				my $lmms_tempolen = tick2lmms($tempi->[-1][0]) + 1;
				print { $self->{fh} } <<EOF;
			<track muted="0" type="5" name="Automation track">
				<automationtrack/>
				<automationpattern name="Tempo" pos="0" len="$lmms_tempolen">
EOF
				for(@$tempi)
				{
					my $lmms_tick = tick2lmms($_->[0]);
					# x [sec/tick]
					# x/60 [min/tick]
					# 60/x [tick/min]
					# (60/$opus->ticks())/x [quarter/min]
					my $lmms_tempo = int((60 / $opus->ticks()) / $_->[1] + 0.5);
					print { $self->{fh} } <<EOF;
					<time value="$lmms_tempo" pos="$lmms_tick"/>
EOF
				}
				print { $self->{fh} } <<EOF;
					<object id="3481048"/>
				</automationpattern>
			</track>
EOF
			}
			print { $self->{fh} } <<EOF;
			<track muted="0" type="2" name="Sample track">
				<sampletrack vol="100">
					<fxchain numofeffects="0" enabled="0"/>
				</sampletrack>
EOF
		},
		sample => sub {
			my ($self, $tick, $dtick, $time, $dtime, $outname) = @_;
			my $lmms_tick = tick2lmms($tick);
			my $lmms_dtick = tick2lmms($tick + $dtick) - $lmms_tick;
			print { $self->{fh} } <<EOF;
				<sampletco muted="0" pos="$lmms_tick" len="$lmms_dtick" src="$outname" />
EOF
		},
		footer => sub {
			my ($self) = @_;
			print { $self->{fh} } <<EOF;
			</track>
		</trackcontainer>
	</song>
</multimedia-project>
EOF
		}
	},
	mid => {
		header => sub {
			my ($self, $totalticks, $totaltime, $tempi) = @_;
			$self->{opus} = MIDI::Opus->new();
			$self->{opus}->format(1);
			$self->{opus}->ticks($opus->ticks());
			my $metatrack = MIDI::Track->new();
			$metatrack->events(reltime map {
				['set_tempo', $_->[0], $_->[1] * $opus->ticks() / 0.000001]
			} @$tempi);
			my $notetrack = MIDI::Track->new();
			$self->{opus}->tracks($metatrack, $notetrack);
		},
		sample => sub {
			my ($self, $tick, $dtick, $time, $dtime, $outname) = @_;
			my $track = $self->{opus}->tracks_r()->[-1];
			$outname =~ s/^.*\///;
			$track->new_event(
				'text_event', $tick, "$OUTPUT_MID_PREFIX$outname");
		},
		footer => sub {
			my ($self) = @_;
			my $track = $self->{opus}->tracks_r()->[-1];
			$track->events(reltime $track->events());
			$self->{opus}->write_to_file($OUTPUT_FILE);
			delete $self->{opus};
		}
	},
	wav => {
		header => sub {
			my ($self, $totalticks, $totaltime, $tempi) = @_;
			$self->{buf} = [];
		},
		sample => sub {
			my ($self, $tick, $dtick, $time, $dtime, $outname) = @_;
			my $sample_start = int($time * $SOX_RATE) * $SOX_AFTEREFFECTS_CHANNELS;
			my $cmd = 'sox "$IN" -t raw -r "$RATE" -e signed -b 16 -c $AFTEREFFECTS_CHANNELS -';
			my $fh = do
			{
				local $ENV{IN} = $outname;
				local $ENV{RATE} = $SOX_RATE;
				local $ENV{AFTEREFFECTS_CHANNELS} = $SOX_AFTEREFFECTS_CHANNELS;
				open my $fh, '-|', "$cmd"
					or die "$cmd: $!";
				$fh;
			};
			my $data = do { undef local $/; <$fh>; };
			close $fh
				or die "$cmd: $! $?";
			die "$cmd: No data"
				unless length $data;
			my @samples = unpack "s*", $data;
			my $b = $self->{buf};
			$b->[$_ + $sample_start] += $samples[$_]
				for 0..@samples-1;
		},
		footer => sub {
			my ($self) = @_;
			my $clip = 0;
			for(@{$self->{buf}})
			{
				$_ //= 0;
				if($_ > 32767)
				{
					++$clip;
					$_ = 32767;
				}
				if($_ < -32768)
				{
					++$clip;
					$_ = -32768;
				}
			}
			statusout "$clip clipped samples\n"
				if $clip > 0;
			statusout "writing...\n";
			my $cmd = 'sox -t raw -r "$RATE" -e signed -b 16 -c $AFTEREFFECTS_CHANNELS - -t wav "$OUT"';
			my $fh = do
			{
				local $ENV{OUT} = $OUTPUT_FILE;
				local $ENV{RATE} = $SOX_RATE;
				local $ENV{AFTEREFFECTS_CHANNELS} = $SOX_AFTEREFFECTS_CHANNELS;
				open my $fh, '|-', "$cmd"
					or die "$cmd: $!";
				$fh;
			};
			print $fh pack "s*", @{$self->{buf}};
			close $fh
				or die "$cmd: $! $?";
		}
	}
);
my $out = defined $OUTPUT_FORMAT ? $out{$OUTPUT_FORMAT} : undef;
my $out_self = {};

sub getfirstmaximum($$$)
{
	my ($correl, $mi, $ma) = @_;

	# we're looking for the first "big" maximum in the range

	my $best = $mi;
	my $bestscore = 0;
	for($mi .. $ma)
	{
		my $s = $correl->[$_];
		my $dist = $_ - $best;
		my $bias = $ANALYZE_BIAS;
		if($dist <= $ANALYZE_RANGE1)
		{
			$bias = 1;
		}
		elsif($dist < $ANALYZE_RANGE2)
		{
			$bias = ($bias - 1) * (($dist - $ANALYZE_RANGE1) / ($ANALYZE_RANGE2 - $ANALYZE_RANGE1)) + 1;
		}
		if($s > $bestscore * $bias)
		{
			$best = $_;
			$bestscore = $s;
		}
	}

	# now $best points APPROXIMATELY to the frequency... but is not correct
	# do a second search for a maximum around $best

	$mi = int($best * 0.67)
		if $mi < int($best * 0.67);
	$ma = int($best * 1.50)
		if $ma > int($best * 1.50);

	for($mi .. $ma)
	{
		my $s = $correl->[$_];
		if($s > $bestscore)
		{
			$best = $_;
			$bestscore = $s;
		}
	}

	return $best;
}

sub getpitch($$$$)
{
	my ($data, $mi, $ma, $debugfile) = @_;

	# pad for autocorrelation to work
	my $n = @$data;
	my $n2 = $n;
	--$n2;
	$n2 |= $n2 >> 16;
	$n2 |= $n2 >> 8;
	$n2 |= $n2 >> 4;
	$n2 |= $n2 >> 2;
	$n2 |= $n2 >> 1;
	++$n2;
	my @data = (@$data, (0) x (2 * $n2 - $n));
	my $fft = new Math::FFT(\@data);

	my $correl = $fft->correl($fft);

	# EAC algorithm
	# 1. clip off values below zero
	for(0..@$correl-1)
	{
		$correl->[$_] = 0
			if $correl->[$_] < 0;
	}
	# 2. subtract a time-doubled signal
	for(reverse 1..@$correl/2-1)
	{
		if($_ % 2)
		{
			$correl->[$_] -= ($correl->[($_ - 1) / 2] + $correl->[($_ - 1) / 2]) / 2;
		}
		else
		{
			$correl->[$_] -= $correl->[$_ / 2];
		}
		# 3. clip off values below zero again
		$correl->[$_] = 0
			if $correl->[$_] < 0;
	}

	$ma = int(@$correl / 2 - 1)
		if $ma > int(@$correl / 2 - 1);

	my $best = getfirstmaximum($correl, $mi, $ma);

	my $sf = 0;
	my $s = 0;
	for($best-$ANALYZE_ENV .. $best+$ANALYZE_ENV)
	{
		$s += $correl->[$_];
		$sf += $_ * $correl->[$_];
	}

	my $ret = $sf / $s;

	if(defined $debugfile)
	{
		open my $fh, ">", $debugfile;
		for(0..@$correl-1)
		{
			my $valid = ($_ >= $mi && $_ <= $ma);
			my $diff = ($_ - $ret);
			print $fh "$_ $correl->[$_] $diff $valid\n";
		}
		close $fh;
	}

	return $ret;
}

sub get_voice_sample($$$$)
{
	my ($pitch, $velocity, $speed, $syllable) = @_;
	{
		local $ENV{OUT} = $ESPEAK_TEMPDIR . "/espeak_raw.wav";
		local $ENV{PITCH} = $pitch;
		local $ENV{VELOCITY} = $velocity;
		local $ENV{SPEED} = $speed;
		local $ENV{VOICE} = $ESPEAK_VOICE;
		local $ENV{VOICE_PATH} = $ESPEAK_VOICE_PATH;
		local $ENV{SYLLABLE} = $syllable;
		local $ENV{TEMP} = $ESPEAK_TEMPDIR;
		0 == system $ESPEAK
			or die "$ESPEAK: $! $?";
	}
	my $fh = do
	{
		local $ENV{PREEFFECTS} = $SOX_PREEFFECTS;
		local $ENV{PREEFFECTS_CHANNELS} = $SOX_PREEFFECTS_CHANNELS;
		local $ENV{RATE} = $SOX_RATE;
		local $ENV{IN} = $ESPEAK_TEMPDIR . "/espeak_raw.wav";
		local $ENV{TEMP} = $ESPEAK_TEMPDIR;
		open my $fh, '-|', $SOX_PROCESS_IN_TO_S16LE
			or die "$SOX_PROCESS_IN_TO_S16LE: $!";
		$fh;
	};
	my $data = do { undef local $/; <$fh>; };
	close $fh
		or die "$SOX_PROCESS_IN_TO_S16LE: $! $?";
	return $data;
}

my @pitch_cache = ();
sub get_pitch_cached($);
sub get_pitch_cached($)
{
	my ($pitch) = @_;
	if($ESPEAK_PITCH_FACTOR)
	{
		if($pitch != $ESPEAK_PITCH_START)
		{
			my $base = get_pitch_cached($ESPEAK_PITCH_START);
			my $f = $ESPEAK_PITCH_FACTOR->($pitch) / $ESPEAK_PITCH_FACTOR->($ESPEAK_PITCH_START);
			return $f * $base;
		}
	}
	return $pitch_cache[$pitch]
		if exists $pitch_cache[$pitch];
	my $cache = sprintf "%s/%s_%s_%d.pitch",
		$ESPEAK_CACHE, $ESPEAK_CACHE_PREFIX, $ESPEAK_VOICE, $pitch;
	if(open my $fh, "<", $cache)
	{
		my $ret = <$fh> + 0;
		close $fh;
		return $ret;
	}
	my @hz = ();
	statusout "Caching pitch $pitch... ";
	for my $syllable(qw/do re mi fa so la ti/)
	{
		for(0..($ESPEAK_PITCH_CACHE_SPEEDS - 1))
		{
			my $speed = int(0.5 + $ESPEAK_SPEED_MIN + ($ESPEAK_SPEED_MAX - $ESPEAK_SPEED_MIN + 1) * (($_ + 0.5) / $ESPEAK_PITCH_CACHE_SPEEDS));
			my $data = get_voice_sample $pitch, $ESPEAK_VELOCITY_NORMAL, $speed, $syllable;
			die "$SOX_PROCESS_IN_TO_S16LE: No data"
				unless length $data;
			my @data = unpack "s*", $data;
			my $thishz = $SOX_RATE / getpitch(\@data, $SOX_RATE / $ANALYZE_MAXFREQ, $SOX_RATE / $ANALYZE_MINFREQ, undef);

			push @hz, $thishz;
		}
	}
	my $median = [sort { $a <=> $b } @hz]->[(@hz - 1) / 2];
	my $above = scalar grep { $_ >= $median * (2 ** (0.5 / 12)) } @hz;
	my $below = scalar grep { $_ <= $median / (2 ** (0.5 / 12)) } @hz;
	statusout "$median ($above above, $below below)\n";
	if(open my $fh, ">", $cache)
	{
		print $fh "$median\n";
		close $fh;
	}
	return $pitch_cache[$pitch] = $median;
}
sub find_pitch_cached($)
{
	my ($freq) = @_;

	# perform a binary search in the pitch cache

	my $mi = $ESPEAK_PITCH_MIN;
	my $ma = $ESPEAK_PITCH_MAX;
	return $mi
		if $mi == $ma;
	my $mihz = get_pitch_cached($mi);
	return $mi
		if $freq <= $mihz;
	my $mahz = get_pitch_cached($ma);
	die "CACHE INCONSISTENCY: $mahz not > $mihz"
		if $mahz <= $mihz;
	return $ma
		if $freq >= $mahz;
	while($ma - $mi > 1)
	{
		my $cur = int(($mi + $ma) / 2);
		my $curhz = get_pitch_cached($cur);
		die "CACHE INCONSISTENCY: $curhz not > $mihz"
			if $curhz <= $mihz;
		die "CACHE INCONSISTENCY: $curhz not < $mahz"
			if $curhz >= $mahz;
		if($freq > $curhz)
		{
			$mi = $cur;
			$mihz = $curhz;
		}
		elsif($freq < $curhz)
		{
			$ma = $cur;
			$mahz = $curhz;
		}
		else
		{
			return $cur; # never gonna happen anyway
		}
	}
	if($freq > ($mihz + $mahz) / 2)
	{
		return $ma;
	}
	else
	{
		return $mi;
	}
}

my @last_syllables = ();
sub play_note($$$$$$$$)
{
	my ($tick, $dtick, $t, $dt, $note, $velocity, $pitchbend, $syllable) = @_;

	# kill whitespace (it does nothing)
	$syllable =~ s/^\s+//;
	$syllable =~ s/\s+$//;

	# iterative find fitting -p (pitch, 0..99) and -s (speed, 80..450)
	# parameters for espeak

	my $pitchbend_str = "";
	my $pitchbend_t = 0;
	for(@$pitchbend)
	{
		my ($start, $end, $semitones) = @$_;
		my $delay = $start - $pitchbend_t;
		my $duration = $end - $start;
		$pitchbend_t = $end;
		my $cents = $semitones * 100;
		my $cents_str = sprintf "%.1f", $cents;
		$cents_str = "0.001"
			if $cents_str eq "0.0"; # prevent sox error
		$duration = 0.001
			if $duration < 0.001;
		$pitchbend_str .= sprintf " %.6f,%s,%.6f",
			$delay, $cents_str, $duration;
	}
	$pitchbend_str = "bend$pitchbend_str"
		if length $pitchbend_str;

	my $hz = 440 * 2**(($note - 69 + $ESPEAK_TRANSPOSE) / 12);
	my $pitch = $ESPEAK_PITCH_START;
	my $speed = $ESPEAK_SPEED_START;
	my $evelocity = int($ESPEAK_VELOCITY_MIN
		+ ($velocity / 127.0)
			* ($ESPEAK_VELOCITY_MAX - $ESPEAK_VELOCITY_MIN) + 0.5);

	my $propstr = Digest::SHA::sha1_hex join "/",
		$pitchbend_str, $syllable;
	my $outname = sprintf "%s/%s_%s_%d_%.2f_%.2f_%.2f_%s.wav",
		$ESPEAK_CACHE, $ESPEAK_CACHE_PREFIX, $ESPEAK_VOICE, $SOX_RATE, $dt, $hz, $velocity, $propstr;

	push @last_syllables, $syllable;
	shift @last_syllables
		if @last_syllables > 10;
	statusline "@last_syllables";

	if(!-f $outname)
	{
		statusout "Rendering $outname...";
		my $thisdt;
		my $thishz;
		my $data;
		if($ESPEAK_PITCH_CACHE)
		{
			$pitch = find_pitch_cached($hz);
		}
		for(1..$ESPEAK_ATTEMPTS)
		{
			$data = get_voice_sample $pitch, $evelocity, $speed, $syllable;
			if(!length $data)
			{
				warn "$SOX_PROCESS_IN_TO_S16LE: No data";
				return;
			}

			$thisdt = (length($data) / 2) / $SOX_RATE;

			if($ESPEAK_PITCH_CACHE)
			{
				$thishz = get_pitch_cached($pitch);
			}
			else
			{
				my @data = unpack "s*", $data;
				$thishz = $SOX_RATE / getpitch(\@data, $SOX_RATE / $ANALYZE_MAXFREQ, $SOX_RATE / $ANALYZE_MINFREQ, undef);
				$pitch = [sort { $a <=> $b } (
					$ESPEAK_PITCH_MIN,
					int($pitch * $hz / $thishz + 0.5),
					$ESPEAK_PITCH_MAX)]->[1];
			}

			$speed = [sort { $a <=> $b } (
				$ESPEAK_SPEED_MIN,
				int($speed * $thisdt / $dt + 0.5),
				$ESPEAK_SPEED_MAX)]->[1];
		}

		statusout "pitch=$pitch velocity=$evelocity speed=$speed $pitchbend_str";

		my $pitchfix = $hz / $thishz;
		my $lengthfix = $dt / $thisdt;
		if(!$ESPEAK_PITCH_CACHE)
		{
			if($pitchfix <= 0 || abs(log($pitchfix)) > 0.1)
			{
				warn "Large pitch correction: $thishz -> $hz";
				getpitch([unpack("s*", $data)], $SOX_RATE / $ANALYZE_MAXFREQ, $SOX_RATE / $ANALYZE_MINFREQ, "$outname.plot");
			}
		}
		my $rate = $SOX_RATE;

		my $tempofix = 1 / $lengthfix;

		if(!$SOX_PROCESS_TEMPO_PITCHBEND_S16LE_TO_OUT_USES_PITCH)
		{
			my $rate0 = $rate;
			$rate = int(0.5 + $rate0 * $pitchfix);
			$tempofix *= $rate0 / $rate;;
			$pitchfix = 1;
		}

		$tempofix = $SOX_TEMPO_MIN
			if $tempofix < $SOX_TEMPO_MIN;
		$tempofix = $SOX_TEMPO_MAX
			if $tempofix > $SOX_TEMPO_MAX;

		my $fh = do
		{
			local $ENV{PREEFFECTS_CHANNELS} = $SOX_PREEFFECTS_CHANNELS;
			local $ENV{AFTEREFFECTS} = $SOX_AFTEREFFECTS;
			local $ENV{AFTEREFFECTS_CHANNELS} = $SOX_AFTEREFFECTS_CHANNELS;
			local $ENV{RATE} = $rate;
			local $ENV{TEMPO} = $tempofix;
			local $ENV{PITCH} = $pitchfix;
			local $ENV{OUT} = $outname;
			local $ENV{PITCHBEND} = $pitchbend_str;
			local $ENV{TEMP} = $ESPEAK_TEMPDIR;
			open my $fh, '|-', $SOX_PROCESS_TEMPO_PITCHBEND_S16LE_TO_OUT
				or die "$SOX_PROCESS_TEMPO_PITCHBEND_S16LE_TO_OUT: $!";
			$fh;
		};
		print $fh $data;
		close $fh
			or die "$SOX_PROCESS_TEMPO_PITCHBEND_S16LE_TO_OUT: $! $?";

		statusout "\n";
	}

	$out->{sample}->($out_self, $tick, $dtick, $t, $dt, $outname);
}

# we store tempo as seconds per tick
my @tempi = map {
	$_->[0] eq 'set_tempo'
		? ([$_->[1], $_->[2] * 0.000001 / $opus->ticks()])
		: ();
} abstime $opus->tracks_r()->[0]->events();
sub tick2sec($)
{
	my ($tick) = @_;
	my $sec = 0;
	my $curtempo = [0, 0.5 / $opus->ticks()];
	for(@tempi)
	{
		if($_->[0] < $tick)
		{
			# this event is in the past
			# we add the full time since the last one then
			$sec += ($_->[0] - $curtempo->[0]) * $curtempo->[1];
		}
		else
		{
			# if this event is in the future, we break
			last;
		}
		$curtempo = $_;
	}
	$sec += ($tick - $curtempo->[0]) * $curtempo->[1];
	return $sec;
};

sub hmmsshh($)
{
	my ($t) = @_;
	my $ticks = int($t * 100 + 0.5);
	my $hh = $ticks % 100; $ticks = int($ticks / 100);
	my $SS = $ticks % 60; $ticks = int($ticks / 60);
	my $MM = $ticks % 60; $ticks = int($ticks / 60);
	my $H = $ticks;
	return sprintf "%d:%02d:%02d.%02d", $H, $MM, $SS, $hh;
}
my $ass_line = "";
my $ass_starttime = undef;
my $ass_lasttime = undef;
my $ass_endtime = undef;
my $ass_first = 1;
my $ass_wantspace = "";
sub dump_lyric($$$$)
{
	my ($start, $end, $text, $endofline) = @_;

	return
		if $OUTPUT_FILE eq '-';

	my $endtime = undef;
	if (defined $ass_endtime)
	{
		if (defined $start)
		{
			my $d = $start - $ass_endtime;
			$d = $ASS_PRETIME + $ASS_POSTTIME
				if $d > $ASS_PRETIME + $ASS_POSTTIME;
			$d = 0
				if $d < 0;
			$d *= $ASS_POSTTIME / ($ASS_PRETIME + $ASS_POSTTIME);
			$endtime = $ass_endtime + $d;
		}
		else
		{
			$endtime = $ass_endtime + $ASS_POSTTIME;
		}
		my $startstr = hmmsshh $ass_starttime;
		my $endstr = hmmsshh $endtime;
		statusout "";
		print "Dialogue: 0,$startstr,$endstr,Karaoke,eCantorix,0000,0000,0000,,$ass_line\n";
		$ass_line = "";
		undef $ass_starttime;
		undef $ass_lasttime;
		undef $ass_endtime;
		$ass_wantspace = "";
	}

	# just finalize if $text is not defined
	return
		if not defined $text;

	if ($ass_first && $endofline) {
		$ass_first = 0;
		statusout "";
		print <<EOF;
[Script Info]
; Script generated by eCantorix
Title: <untitled>
Original Script: <unknown>
ScriptType: v4.00+
Collisions: Normal
PlayResX: 720
PlayResY: 480
PlayDepth: 0
Timer: 100.0000
WrapStyle: 0

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, TertiaryColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, AlphaLevel, Encoding
Style: Karaoke,Arial,28,&H00FF0000,&H00FFFF00,&H00FFFFFF,&H80000000,-1,0,0,0,100,100,0.00,0.00,1,1.00,2.00,2,30,30,30,0

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
EOF
	}

	$text =~ s/^(\s*)//;
	my $space = $ass_wantspace . $1;
	$text =~ s/(\s*)$//;
	$ass_wantspace = $1;

	if (!defined $ass_starttime)
	{
		if (defined $endtime)
		{
			my $d = $start - $endtime;
			$d = $ASS_PRETIME
				if $d > $ASS_PRETIME;
			$d = 0
				if $d < 0;
			$ass_starttime = $start - $d;
		}
		else
		{
			$ass_starttime = $start - $ASS_PRETIME;
		}
		$ass_lasttime = $ass_starttime;
		$space = "";
	}

	# advance to $start
	my $ass_delta = int(100 * ($start - $ass_lasttime) + 0.5);
	$ass_delta = 0
		if $ass_delta < 0;
	$ass_line .= "{\\k$ass_delta}"
		if $ass_delta > 0 || $space ne "";
	$ass_line .= $space;
	# output lyrics
	my $ass_length = int(100 * ($start + ($end - $start) * $ASS_LENGTHFACTOR - $ass_lasttime) + 0.5 - $ass_delta);
	$ass_length = 0
		if $ass_length < 0;
	$ass_line .= "{\\kf$ass_length}"
		if $ass_length > 0;
	$ass_line .= $text;
	$ass_lasttime += 0.01 * ($ass_delta + $ass_length);

	if ($endofline)
	{
		$ass_endtime = $end;
	}
}

my $tracks = $opus->tracks_r();

my $totallen = 0;
for my $trackno(0..@$tracks-1)
{
	my $track = $tracks->[$trackno];
	my @events = abstime $track->events();
	$totallen = $events[-1][1]
		if @events and $events[-1][1] > $totallen;;
}
my $totaltime = tick2sec $totallen;
$out->{header}->($out_self, $totallen, $totaltime, \@tempi)
	if defined $out;

my %text_event_types = map { $_ => 1 } qw(lyric text_event);
my %note_event_types = map { $_ => 1 } qw(note_on note_off);

my $lyricstrack = undef;
my $lyricstextcount = 0;
my $lyricsnotecount = 0;
for my $trackno(0..@$tracks-1)
{
	statusout "Analyzing track $trackno...\n";
	my $track = $tracks->[$trackno];
	my @events = abstime $track->events();
	my $has_lyrics = grep { $text_event_types{$_->[0]} } @events;
	my $has_notes = grep { $note_event_types{$_->[0]} } @events;
	if ($has_lyrics > $lyricstextcount) {
		($lyricstrack, $lyricstextcount, $lyricsnotecount) = ($trackno, $has_lyrics, $has_notes);
	}
}
$lyricstrack = undef
	if $lyricsnotecount > 0;

for my $trackno(0..@$tracks-1)
{
	statusout "Processing track $trackno...\n";
	my $track = $tracks->[$trackno];

	my @events = abstime $track->events();

	if (defined $lyricstrack)
	{
		statusout "... using lyrics track $trackno...\n";
		next
			if $trackno == $lyricstrack;
		@events = sorttime(
			@events, abstime $tracks->[$lyricstrack]->events());
	}

	# lyrics = array of [starttick, text, channels]
	# channels = hash of channel => notes
	# notes = array of [starttick, ticks, pitch]
	my @lyrics = map {
		$text_event_types{$_->[0]} ? [$_->[1], $_->[2], {}] : ()
	} @events;
	my $insert_note = sub
	{
		my ($starttick, $ticks, $channel, $pitch, $velocity) = @_;
		my $found = undef;
		for(@lyrics)
		{
			if($_->[0] <= $starttick)
			{
				$found = $_;
			}
		}
		if($found)
		{
			push @{$found->[2]{$channel}},
				[$starttick, $ticks, $pitch, $velocity];
			return 1;
		}
		else
		{
			return undef;
		}
	};

	my @notes = ();
	my %notehash;
	for(@events)
	{
		$_->[0] = 'note_off' if $_->[0] eq 'note_on' and $_->[4] == 0;
		next
			if $_->[0] ne 'note_on' and $_->[0] ne 'note_off';
		my $time = $_->[1];
		my $channel = $_->[2];
		my $pitch = $_->[3];
		my $velocity = $_->[4];
		if($_->[0] eq 'note_on')
		{
			if(exists $notehash{$channel}{$pitch})
			{
				warn "note_already_on: $time/$channel/$pitch";
				next;
			}
			$notehash{$channel}{$pitch} = [$time, $velocity];
		}
		else
		{
			if(!exists $notehash{$channel}{$pitch})
			{
				warn "Spurious note_off: $time/$channel/$pitch";
				next;
			}
			my ($starttime, $startvelocity) = @{$notehash{$channel}{$pitch}};
			delete $notehash{$channel}{$pitch};

			$insert_note->(
				$starttime, $time - $starttime,
				$channel, $pitch, $startvelocity)
				or warn "No lyrics found for note: $starttime/$time/$channel/$pitch";
		}
	}

	for(@lyrics)
	{
		my ($starttick, $text, $channels) = @$_;
		next
			if $text =~ /^%/;
		$text =~ s/^\///;
		$text =~ s/^\\//;
		my $has_newline = 0;
		$has_newline = 1
			if $text =~ s/\\n\s*$//;

		if($EDIT_SYLLABLES)
		{
			local $_ = $text;
			$EDIT_SYLLABLES->();
			$text = $_;
		}

		next
			if $text !~ /\S/;

		for my $channel(sort keys %$channels)
		{
			my $notes = $channels->{$channel};
			my $realstarttick = $notes->[0]->[0];
			my $realendtick = $notes->[-1]->[0] + $notes->[-1]->[1];
			my $realstarttime = tick2sec $realstarttick;
			my $realendtime = tick2sec $realendtick;
			next
				if $realendtime - $realstarttime > 30;
			dump_lyric $realstarttime, $realendtime, $text, $has_newline;
			next
				if not defined $out;
			my @pitchbend = ();
			my $lastnoteendtime = $realstarttime;
			my $sumpitch = 0;
			my $sumvelocity = 0;
			my $sumtime = 0;
			for(@$notes)
			{
				my ($notestarttick, $noteticks, $pitch, $velocity) = @$_;
				my $noteendtick = $notestarttick + $noteticks;
				my $notestarttime = tick2sec($notestarttick);
				my $noteendtime = tick2sec($noteendtick);
				die "Overlapping notes: $notestarttime/$channel/$pitch"
					if $notestarttime < $lastnoteendtime;
				$sumtime += $noteendtime + $notestarttime;
				$sumpitch += ($noteendtime + $notestarttime) * $pitch;
				$sumvelocity += ($noteendtime + $notestarttime) * $velocity;
				push @pitchbend, [
					$lastnoteendtime - $realstarttime,
					$notestarttime - $realstarttime,
					$pitch];
				$lastnoteendtime = $noteendtime;
			}
			my $avgpitch = $sumpitch / $sumtime;
			my $avgvelocity = $sumvelocity / $sumtime;
			$_->[2] -= $avgpitch
				for @pitchbend;
			shift @pitchbend
				while @pitchbend
					and abs($pitchbend[0][2]) < 0.001;
			play_note
				$realstarttick,
				$realendtick - $realstarttick,
				$realstarttime,
				$realendtime - $realstarttime,
				$avgpitch, $avgvelocity, \@pitchbend, $text;
		}
	}

	dump_lyric undef, undef, undef, undef;
}

$out->{footer}->($out_self)
	if defined $out;