1
#!/bin/bash
2
# -*- mode: sh; coding: utf-8 -*-
3
# $Date: 2008/03/01 06:26:19 $
4
5
# Do the matching for Latin alphabet (English), but in the UTF-8 encoding (for wide characters).
6
# recode_latin_utf8: filter stdin -> stdout.
# Recodes an ASCII phonetic transcription into Unicode phonetic symbols and then
# checks that no unconverted markup is left over.
#
# Returns sed's exit status (the codes rely on GNU sed's `q EXIT-CODE` extension):
#   0   every line was converted and passed the checks;
#   56  a '^' was left over after the conversion;
#   57  a capital letter not preceded by a backslash was left over.
# On 56/57 processing stops at the offending line. That line, as modified by the
# destructive checks, is written to stderr by `w /dev/stderr`; since auto-print
# is not disabled, `q` also prints it to stdout.
#
# The sed program is kept in a *quoted* here-document so that the shell performs
# no expansion or backslash removal on it: sed receives exactly the text below.
# (Inside a double-quoted string the shell would turn `\$` into `$` and `\\` into
# `\`, so the two "ignore" rules of the well-formedness check would not match
# the literal `$Date...$` keyword and the literal `\url{...}` markup as written.)
recode_latin_utf8() {
  local sed_program
  # `read -d ''` slurps the whole here-document; it returns non-zero on reaching
  # end of input even though the variable has been assigned, hence `|| true`.
  IFS= read -r -d '' sed_program <<'SED_PROGRAM' || true
# NB: take care when writing this map that it should basically be injective
# (except for unimportant variants like s^' and s'^): if it's not, it means
# there is a mess in your transcription system, and you should fix your transcripts.
# FIXME: Terrible bug: now, '\Fut' gives '\wˣut'. This is not elegantly
# avoidable in this sed script, so I'll rewrite the program in another language.
#############################################
# Very special combinations of symbols
# (these transformations must come in the script before the other
# transformations, because the others are more general
# and would match these special cases as well.)
# Sometimes I wrote the phonetic variants of sʹ, zʹ:
# (FIXME: when should I use sʹ and when ś in the recoded script?)
# (s^' can also represent Russian s^', as in 'es^'o'.)
s|s'^|ś|g
s|s^'|ś|g
s|z'^|ź|g
s|z^'|ź|g
# possibly I could write the phonetic variant of t':
s|c^'|ć|g # soft, of course
s|c'^|ć|g
# FIXME: How did I write an analogous variant of d'? E.g., the word for `Теньгушево' after a voiced? Or is it hard?
################################################
# Fricatives:
s|s^|š|g # fricative
s|c^|č|g # affricate, pronounced hard
s|z^|ž|g # fricative
s|Z^|ǯ|g # affricate
# At last, the simplest combinations:
#s|Z|ʒ|g # affricate -- Commented out, because it seems this sound did not occur in Mordvin or Komi speech, but I wrote it by mistake. So this conversion will catch the errors.
# (FIXME: What about affricates with the palatal sibilants: ź, ś? Z^'; and c^' is already present.)
###############################################
# Vowels:
s|@|ə|g # schwa (unclear vowel that can be pronounced by them with any quality)
# FIXME: (Komi) try to put the Cyrillic-o-umlaut in the Cyrillic orthography; example:
#  казьт воспоминание казьт@д
#
###############################################
# The voiceless:
s|R|rˣ|g # the voiceless r
s|L|lˣ|g # the voiceless l
#s|F|wˣ|g # the voiceless w (sounds also like x, h)
# -- disabled 'F' because of the '\Fut' bug. Actually,
# 'W' for this sound looks more systematic in my system, so perhaps
# I should have used it, shouldn't I?
s|W|wˣ(?)|g # FIXME: sometimes I wrote 'W', is it the same sound?
s|J|jˣ|g # the voiceless j (sounds also like x, h; soft? Perhaps, I didn't distinguish 2 sounds--soft and hard--when using this sign.)
##############################################
# N:
s|N|ŋ|g # velar or nasalization
h # Save before the destructive checks (copy pattern space to hold space).
##########################################
# Well-formedness of the input (is checked destructively):
# - no ^'s should remain;
# - other capitals should not have been used (no S^, no C^ and no plain capitals);
# - more?
s|\$Date.*\$|| # ignore the 'preamble' (a literal $Date ... $ keyword)
s|\\url{.*}|| # ignore filenames, which have been marked up like this: \url{Komi_rus_slovar1.pdf}
/\^/ {
  w /dev/stderr
  q 56
} # (mostly GNU sed extensions)
# [^\\] -- a workaround for grammatical terms, to escape these restrictions
# on Latin characters: write them like \Pres, \Gen etc.
/\(^\|[^\\]\)[[:upper:]]/ {
  w /dev/stderr
  q 57
} # (mostly GNU sed extensions)
g # Restore after the destructive checks (copy hold space to pattern space).
SED_PROGRAM
  # NOTE(review): POSIX.UTF8 is not a locale name every system provides;
  # confirm what sed falls back to where it is missing.
  LC_ALL=POSIX.UTF8 sed -e "$sed_program"
}

# Run the filter now; its exit status is left in $? for the code that follows.
recode_latin_utf8
73
74
# FIXME: Add a "keep-going" mode: detect every well-formedness violation and accumulate
# all the error codes in exitCode (say, as bit flags).
76
77
# Capture the exit status of the preceding sed run immediately: any command
# executed in between would overwrite $?.
exitCode="$?"

# Explain the two status codes that the sed program raises via `q 56` / `q 57`.
# The messages are sent with '>&2' (duplicate the already-open fd 2) instead of
# '>/dev/stderr': opening /dev/stderr with '>' re-opens the target with
# truncation, so when stderr is redirected to a regular file the offending line
# that sed has just written there would be wiped out.
case "$exitCode" in
  56)
    echo $"-- error: non well-formed input (a left-over '^')!" >&2
    ;;
  57)
    echo $"-- error: non well-formed input (a left-over capital Latin char)!" >&2
    ;;
esac

# Propagate the status unchanged (this also covers any other failure of sed).
exit "$exitCode"
85
86
# (Mordvin) Check that I have marked softness? E.g.:
# |ti|t'i|