FreeSWITCH MRCP in Perl
From SoftIVR
FreeSWITCH MRCP in Perl
With FreeSWITCH not having any supported ASR at the time of writing (with the exception of PocketSphinx), we needed something to allow us to connect it to an MRCP server to test SoftIVR's ASR functionality. After a few false starts, we implemented a simple MRCP connector using the outbound socket interface, unicast and a bit of Perl.
The surprising thing (apart from its working at all - after all, using Perl to take a raw audio stream and turn it into RTP wasn't exactly part of its original design brief) is that it appears to work pretty well, and doesn't consume significant amounts of CPU. The two downsides of using this method are (a) that all clients need to be using PCMU @ 8kHz - this could be fixed with a quick patch to the unicast code, I think - and (b) that there's no scope for playing a prompt to the caller while recognition is taking place.
Here's the code, punctuated periodically with comments. Feel free to use it, but we're unlikely to provide much support if you do; there is a pint on offer at the next ClueCon for anyone who finds a real bug.
# Configure where we are, and where the MRCP server lives:
my %asr_conf = (
    local_ip  => "1.2.3.4",
    server_ip => "1.2.3.4",
    port      => 554,
);

# jsASR($grammar [, $opts])
#
# Runs one speech-recognition pass against an MRCPv1 (RTSP-tunnelled) server.
#
#   $grammar  The grammar text. Sent as application/srgs+xml when it starts
#             with "<?xml", otherwise as application/srgs.
#   $opts     Optional string; "timeout=n" sets the number of seconds to wait
#             for a START-OF-SPEECH event (default 5).
#
# Returns:
#   - a string starting with "ERROR" if the session could not be set up;
#   - the body of the RECOGNITION-COMPLETE message when the server sends one
#     (or the MRCP message itself if it carries no Content-Length);
#   - "" if the no-speech timeout expired, the audio stream dried up, or the
#     server closed the connection before a result arrived.
#
# This snippet relies on names it does not define: $uid, $sock (the
# FreeSWITCH outbound event socket it prints "sendmsg" to), jsLog(), and the
# modules IO::Socket::INET, IO::Select, Socket and Net::RTP::Packet.
# NOTE(review): confirm these are provided by the enclosing script.
#
# The ($;$) prototype is kept so existing call sites parse as before.
sub jsASR($;$) {
    my $grammar = $_[0];
    my $opts    = (@_ > 1 && defined $_[1]) ? $_[1] : "";

    my $timeout = 5;
    $timeout = $1 if ($opts =~ /timeout=([0-9.]+)/);

    my $cseq         = 1;
    my $rtp_port     = 10000;
    my $rtp_port_max = 65534;    # highest even port; anything above 65535 is not a valid port

    # Set up the RTP socket - do this first, so we know our local port no.
    # Ports are probed in steps of two because the Transport header below
    # advertises the pair $rtp_port / $rtp_port+1.
    my $rtp;
    while (!$rtp && $rtp_port <= $rtp_port_max) {
        $rtp = IO::Socket::INET->new(
            LocalHost => $asr_conf{local_ip},
            LocalPort => $rtp_port,
            Proto     => 'udp',
        );
        $rtp_port += 2 unless $rtp;
    }
    if (!$rtp) {
        return "ERROR No RTP port available for ASR";
    }

    # Sockets opened so far; $abort closes them all on any failure path.
    my @open_socks = ($rtp);
    my $abort = sub {
        my ($ret, $log_msg) = @_;
        jsLog($log_msg) if defined $log_msg;
        foreach my $s (@open_socks) {
            $s->close;
        }
        return $ret;
    };

    # Build initial setup message
    my $sdp = "v=0\r\n";
    $sdp .= "o=SoftIVR 0 0 IN IP4 $asr_conf{local_ip}\r\n";
    $sdp .= "s=$uid\r\n";
    $sdp .= "t=0 0\r\n";
    $sdp .= "c=IN IP4 $asr_conf{local_ip}\r\n";
    $sdp .= "m=audio $rtp_port RTP/AVP 0 8\r\n";
    $sdp .= "a=rtpmap:0 PCMU/8000\r\n";
    $sdp .= "a=sendonly\r\n";

    my $setup = "SETUP rtsp://$asr_conf{server_ip}:$asr_conf{port}/recognizer RTSP/1.0\r\n";
    $setup .= "Cseq:$cseq\r\n";
    $setup .= "Transport: RTP/AVP;unicast;client_port=" . $rtp_port . "-" . ($rtp_port + 1) . "\r\n";
    $setup .= "Content-type: application/sdp\r\n";
    $setup .= "Content-length: " . length($sdp) . "\r\n";
    $setup .= "\r\n";
    $setup .= $sdp;

    my $rtsp = IO::Socket::INET->new(
        LocalHost => $asr_conf{local_ip},
        PeerAddr  => $asr_conf{server_ip},
        PeerPort  => $asr_conf{port},
        Blocking  => 0,
    );
    if (!$rtsp) {
        return $abort->(
            "ERROR Failed to establish ASR session",
            "Failed to establish ASR session with $asr_conf{server_ip}:$asr_conf{port}"
        );
    }
    push @open_socks, $rtsp;

    my $asr_sel = IO::Select->new($rtsp);
    $asr_sel->can_write(5);
    print {$rtsp} $setup;

    # Get the reply..
    my $response = "";
    $asr_sel->can_read(5);
    $rtsp->recv($response, 2048);
    $response = "" unless defined $response;
    if ($response !~ /RTSP\/1\.0\s+200/) {
        return $abort->(
            "ERROR Failed to establish ASR session",
            "ASR session setup failed: $response"
        );
    }

    # Get server port and IP to which to send audio + session ID.
    # The IP defaults to the configured server unless the reply names a
    # destination explicitly.
    my $server_rtp_port;
    my $session_id;
    my $server_rtp_ip = $asr_conf{server_ip};
    $server_rtp_port = $1 if ($response =~ /server_port=([0-9]+)/i);
    $server_rtp_ip   = $1 if ($response =~ /destination=([0-9.]+)/i);
    $session_id      = $1 if ($response =~ /session:\s*([0-9A-Z.]+)/i);
    if (!defined($server_rtp_ip) || !defined($server_rtp_port) || !defined($session_id)) {
        return $abort->(
            "ERROR NO SESSION",
            "ASR session setup: no server IP/port in $response"
        );
    }

    # Load the grammar: an MRCP RECOGNIZE request tunnelled in an RTSP ANNOUNCE
    $cseq++;
    my $mrcp = "RECOGNIZE 1 MRCP/1.0\r\n";
    if ($grammar =~ /^<\?xml/) {
        $mrcp .= "Content-type: application/srgs+xml\r\n";
    }
    else {
        $mrcp .= "Content-type: application/srgs\r\n";
    }
    $mrcp .= "Content-Id: $uid\@softivr\r\n";
    $mrcp .= "Content-length: " . length($grammar) . "\r\n\r\n";
    $mrcp .= $grammar;

    $setup = "ANNOUNCE rtsp://$asr_conf{server_ip}:$asr_conf{port}/recognizer RTSP/1.0\r\n";
    $setup .= "Cseq:$cseq\r\n";
    $setup .= "Session: $session_id\r\n";
    $setup .= "Content-type: application/mrcp\r\n";
    $setup .= "Content-length: " . length($mrcp) . "\r\n\r\n";
    $setup .= $mrcp;
    $asr_sel->can_write(5);
    $rtsp->send($setup);

    # Check OK
    $response = "";
    $asr_sel->can_read(5);
    $rtsp->recv($response, 2048);
    $response = "" unless defined $response;
    if ($response !~ /RTSP\/1\.0\s+200/) {
        return $abort->("ERROR NO SESSION", "ASR grammar load failed: $response");
    }

    # Open local socket for audio data from FS
    print "$$ Opening S2M listener\n";
    my $s2m = IO::Socket::INET->new(
        LocalAddr => '127.0.0.1',
        LocalPort => $rtp_port,
        Proto     => 'udp',
    );
    if (!$s2m) {
        return $abort->("ERROR NO SESSION", "S2M socket open failed");
    }
    push @open_socks, $s2m;

    my $rtppacket = Net::RTP::Packet->new();
    $rtppacket->payload_type(0);
    $rtppacket->seq_num(0);
    $rtppacket->timestamp(0);
    $rtppacket->marker(1);    # marker set on the first packet only; cleared after the first send
    $asr_sel->add($s2m);

    # Get FS to send audio to us
    # Note the use of 'native' - means don't transcode; this all relies on everyone using PCMU.
    print $sock "sendmsg\r\ncall-command: unicast\r\nlocal-ip: 127.0.0.1\r\nlocal-port: "
        . ($rtp_port + 1)
        . "\r\nremote-ip: 127.0.0.1\r\n"
        . "remote-port: "
        . $rtp_port
        . "\r\ntransport: udp\r\nflags: native\r\n\r\n";

    my $tstart    = time();
    my $inrec     = 0;     # becomes 1 once the server reports START-OF-SPEECH
    my $result    = "";    # only ever set from a RECOGNITION-COMPLETE message
    my $server_in = sockaddr_in($server_rtp_port, inet_aton($server_rtp_ip));
    my $rtsp_sel  = IO::Select->new($rtsp);    # used to wait for the tail of a split message

    # The loop ends when nothing at all is readable for one second (audio
    # stopped and the server is silent), on the no-speech timeout, on a
    # result, or when the server closes the RTSP connection.
    ASR: while (my @ready = $asr_sel->can_read(1)) {

        # Check for a timeout - i.e. timeout reached before we've a start of speech notification
        last ASR if (($inrec == 0) && ((time() - $tstart) > $timeout));

        foreach my $sh (@ready) {
            if ($sh == $rtsp) {
                print "$$ RTSP inbound\n";

                # sysread, not read: buffered reads must not be mixed with select().
                my $msg = "";
                my $got = $rtsp->sysread($msg, 2048);
                if (!defined $got) {
                    next if ($!{EAGAIN} || $!{EWOULDBLOCK} || $!{EINTR});
                    jsLog("ASR RTSP read failed: $!");
                    last ASR;
                }
                last ASR if ($got == 0);    # server closed the connection

                # The first Content-Length is the RTSP one: the size of the
                # tunnelled MRCP message. Strip ONLY the RTSP header
                # (non-greedy) so the MRCP start line with the event name
                # survives for the checks below.
                my $clen = 0;
                $clen = $1 if ($msg =~ /content-length:\s*([0-9]+)/i);
                if ($clen > 0) {
                    $msg =~ s/^.*?(?:\r?\n){2}//s;
                }
                $clen -= length($msg);
                print "$$ $clen\n";

                # Fetch whatever part of the MRCP message has not arrived yet.
                while ($clen > 0) {
                    last unless $rtsp_sel->can_read(1);
                    my $br = $rtsp->sysread(my $tmp, $clen);
                    last unless $br;
                    $msg  .= $tmp;
                    $clen -= $br;
                }

                if ($msg =~ /RECOGNITION-COMPLETE/is) {
                    # What is left starts with the MRCP header; drop it to
                    # leave just the result body.
                    my $body_len = 0;
                    $body_len = $1 if ($msg =~ /content-length:\s*([0-9]+)/is);
                    if ($body_len > 0) {
                        $msg =~ s/^.*?(?:\r?\n){2}//s;
                    }
                    print "$$ RTSP result: $msg\n";
                    $result = $msg;
                    last ASR;
                }
                if ($msg =~ /START-OF-SPEECH/is) {
                    $inrec = 1;
                }
            }

            if ($sh == $s2m) {
                # Got some audio, so top it with an RTP header and send it on
                my $audio = "";
                $s2m->recv($audio, 2048);
                next unless (defined $audio && length $audio);
                $rtppacket->payload($audio);
                $rtp->send($rtppacket->encode, 0, $server_in);
                $rtppacket->seq_num_increment(1);
                # PCMU is one byte per sample, so byte count == sample count.
                $rtppacket->timestamp_increment(length($audio));
                $rtppacket->marker(0);
            }
        }
    }

    # Tear down session. The trailing blank line terminates the RTSP header
    # block; without it the request is incomplete.
    $cseq++;
    $setup = "TEARDOWN rtsp://$asr_conf{server_ip}:$asr_conf{port}/recognizer RTSP/1.0\r\n";
    $setup .= "Cseq:$cseq\r\n";
    $setup .= "Session: $session_id\r\n";
    $setup .= "\r\n";
    $rtsp->send($setup);

    $rtsp->close();
    $s2m->close();
    $rtp->close();

    return $result;
}