# Copyright (C) 2023 Masaya YAMAGUCHI

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

use utf8;
use strict;

binmode STDIN, ':utf8';
binmode STDOUT, ':raw:utf8';
binmode STDERR, ':utf8';

# Convert tab-separated corpus records into an XML document on STDOUT.
#
# The first command-line argument is used only to derive the id attribute
# of the root element (directory part and a trailing ".txt" are removed).
# The records themselves are read with <>, i.e. from any remaining
# arguments or, when there are none, from STDIN.
#
# A record whose second column is "B" starts a new sentence; the records
# collected so far are flushed through print_sentence().
my $filename = shift(@ARGV);

my $tag_corpus = "jlcc";
my $corpus_name = "jlcc"; # NOTE(review): not referenced anywhere in this file
# Attribute name for each input column, by position.  An empty name means
# the column is not written out as an attribute.
my @attributes = (
    "", # author
    "", # boundary mark
    "t", # TEXT
    "h", # PRON
    "r", # YOMI
    "l", # lemma
    "p", # pos
    "c", # conj
    "f", # form
    "e", # YOMI (lemma)
    "g", # goshu
    "n", # tensaku
    "s", # type (tensaku)
    );
my @s = ();  # raw records of the sentence currently being collected
my $sn = 1; # sentence number

# Without a filename argument the id is simply left empty.
my $id = defined($filename) ? $filename : "";
$id =~ s{.+/}{};
$id =~ s/\.txt$//;
# The id ends up inside a double-quoted XML attribute, so escape the
# characters that would otherwise break the markup ("&" must go first).
$id =~ s/&/&amp;/g;
$id =~ s/</&lt;/g;
$id =~ s/>/&gt;/g;
$id =~ s/"/&quot;/g;

print '<?xml version="1.0" encoding="utf-8"?>';
print "\n";
# Open the root element unconditionally: printing it only when the first
# record arrives would leave an unmatched closing tag on empty input.
print "<$tag_corpus id=\"$id\">\n";

while(my $record = <>){
    $record =~ s/[\r\n]+$//;
    my @w = split(/\t/, $record);

    # Rows with fewer than two columns have no boundary mark at all.
    if(defined($w[1]) && $w[1] eq "B"){
        print_sentence() if(@s);
        @s = ();
    }
    push(@s, $record);
}

# Flush the last sentence, which has no following "B" record.
print_sentence() if(@s);
print "</$tag_corpus>\n";



# Write the sentence buffered in the file-level @s to STDOUT as one
# <s n="NNNN">...</s> line and post-increment the sentence counter $sn.
#
# Each element of @s is one tab-separated record and becomes a
# <w ...>TEXT</w> element: every column whose entry in @attributes has a
# non-empty name is written as an attribute, and column 2 is repeated as
# the element content.  Columns missing from a record are written as
# empty strings.
#
# All values are XML-escaped so that &, <, > and " in the data cannot
# produce malformed markup.
# NOTE(review): this assumes the input columns hold raw text rather than
# already-escaped entities -- confirm against the corpus files.
#
# Takes no arguments and returns nothing meaningful.
sub print_sentence{

    printf("<s n=\"%04d\">", $sn++);

    foreach my $record (@s){
        # Limit -1 keeps trailing empty columns instead of dropping them.
        my @w = split(/\t/, $record, -1);
        my $line = "<w";

        foreach my $i (0 .. $#attributes){
            my $name = $attributes[$i];
            next unless($name);  # unnamed columns are not emitted
            $line .= " $name=\"" . _xml_escape($w[$i]) . "\"";
        }
        $line .= ">" . _xml_escape($w[2]) . "</w>";
        print $line;
    }
    print "</s>\n";

    return;
}

# Return $text with the XML special characters &, <, > and " replaced by
# their predefined entities; an undefined value becomes the empty string.
sub _xml_escape{
    my ($text) = @_;

    return "" unless(defined($text));
    $text =~ s/&/&amp;/g;  # must run first so later entities stay intact
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}
