1
/*
2
 * ttman - text to man converter
3
 *
4
 * Copyright 2006 Timo Hirvonen <tihirvon@gmail.com>
5
 *
6
 * This file is licensed under the GPLv2.
7
 */
8
#include <stdlib.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
19
20
/*
 * One lexical element of the input.
 *
 * Tokens are chained into a doubly linked list through next/prev.  The
 * member order matters: the list sentinel is set up with a positional
 * initializer.
 */
struct token {
	struct token *next;
	struct token *prev;
	enum {
		/* plain tokens */
		TOK_TEXT,	/* at most one line of text, never contains '\n' */
		TOK_NL,		/* '\n' */
		TOK_ITALIC,	/* '`' */
		TOK_BOLD,	/* '*' */
		TOK_INDENT,	/* '\t' */

		/* keywords, written as @name in the input */
		TOK_H1,
		TOK_H2,
		TOK_LI,
		TOK_BR,
		TOK_PRE,
		TOK_ENDPRE,	/* value must be TOK_PRE + 1 */
		TOK_RAW,
		TOK_ENDRAW,	/* value must be TOK_RAW + 1 */
		TOK_TITLE,	/* e.g. WRITE 2 2001-12-13 "Linux 2.0.32" "Linux Programmer's Manual" */
	} type;

	/* input line number recorded when the token was created */
	int line;

	/* token text; NOT NUL-terminated, use len */
	const char *text;
	/* number of bytes in text */
	int len;
};
48
49
static const char *program;
50
static const char *filename;
51
static char tmp_file[1024];
52
static FILE *outfile;
53
static int cur_line = 1;
54
static struct token head = { &head, &head, TOK_TEXT, 0, NULL, 0 };
55
56
/* Builds a { string, length-without-NUL } pair from a string literal. */
#define CONST_STR(s) { .str = (s), .len = (int)(sizeof(s) - 1) }

/*
 * Printable name of every token type, in the same order as the token type
 * enumeration so that a type value can be used directly as an index.
 */
static const struct {
	const char *str;
	int len;
} token_names[] = {
	/* plain tokens */
	CONST_STR("text"),
	CONST_STR("nl"),
	CONST_STR("italic"),
	CONST_STR("bold"),
	CONST_STR("indent"),

	/* keywords */
	CONST_STR("h1"),
	CONST_STR("h2"),
	CONST_STR("li"),
	CONST_STR("br"),
	CONST_STR("pre"),
	CONST_STR("endpre"),
	CONST_STR("raw"),
	CONST_STR("endraw"),
	CONST_STR("title")
};

/* number of entries in token_names */
#define NR_TOKEN_NAMES (sizeof(token_names) / sizeof(*token_names))
79
/*
 * Report a violated internal invariant, naming the enclosing function, and
 * terminate via die().  __func__ is the standard C99 spelling; the GNU-only
 * __FUNCTION__ would not be available on the non-GNU compilers that the
 * #else branch below is written for.
 */
#define BUG() die("BUG in %s\n", __func__)

/* Marks a function as never returning; expands to nothing without GNU C. */
#ifdef __GNUC__
#define __NORETURN	__attribute__((__noreturn__))
#else
#define __NORETURN
#endif
86
87
static __NORETURN void quit(void)
88
{
89
	if (tmp_file[0])
90
		unlink(tmp_file);
91
	exit(1);
92
}
93
94
static __NORETURN void die(const char *format, ...)
95
{
96
	va_list ap;
97
98
	fprintf(stderr, "%s: ", program);
99
	va_start(ap, format);
100
	vfprintf(stderr, format, ap);
101
	va_end(ap);
102
	quit();
103
}
104
105
static __NORETURN void syntax(int line, const char *format, ...)
106
{
107
	va_list ap;
108
109
	fprintf(stderr, "%s:%d: error: ", filename, line);
110
	va_start(ap, format);
111
	vfprintf(stderr, format, ap);
112
	va_end(ap);
113
	quit();
114
}
115
116
static inline const char *keyword_name(int type)
117
{
118
	if (type < TOK_H1 || type > TOK_TITLE)
119
		die("BUG: no keyword name for type %d\n", type);
120
	return token_names[type].str;
121
}
122
123
/*
 * malloc() wrapper that never returns NULL: on allocation failure the
 * program is terminated with an error message.
 */
static void *xmalloc(size_t size)
{
	void *ret = malloc(size);

	if (!ret) {
		// %zu is the conversion for size_t; the previous "%ul" was
		// "%u" followed by a literal 'l' and mismatched the argument
		die("OOM when allocating %zu bytes\n", size);
	}
	return ret;
}
131
132
/*
 * Return a newly allocated, NUL-terminated copy of the first len bytes of
 * str.  The caller owns the returned buffer.
 */
static char *memdup(const char *str, int len)
{
	char *copy = xmalloc(len + 1);

	copy[len] = '\0';
	memcpy(copy, str, len);
	return copy;
}
139
140
static struct token *new_token(int type)
141
{
142
	struct token *tok = xmalloc(sizeof(struct token));
143
144
	tok->prev = NULL;
145
	tok->next = NULL;
146
	tok->type = type;
147
	tok->line = cur_line;
148
	return tok;
149
}
150
151
static void free_token(struct token *tok)
152
{
153
	struct token *prev = tok->prev;
154
	struct token *next = tok->next;
155
156
	if (tok == &head)
157
		BUG();
158
159
	prev->next = next;
160
	next->prev = prev;
161
	free(tok);
162
}
163
164
static void emit_token(struct token *tok)
165
{
166
	tok->prev = head.prev;
167
	tok->next = &head;
168
	head.prev->next = tok;
169
	head.prev = tok;
170
}
171
172
static void emit(int type)
173
{
174
	struct token *tok = new_token(type);
175
	tok->len = 0;
176
	tok->text = NULL;
177
	emit_token(tok);
178
}
179
180
static int emit_keyword(const char *buf, int size)
181
{
182
	int i, len;
183
184
	for (len = 0; len < size; len++) {
185
		if (!isalnum((unsigned char)buf[len]))
186
			break;
187
	}
188
189
	if (!len)
190
		syntax(cur_line, "keyword expected\n");
191
192
	for (i = TOK_H1; i < NR_TOKEN_NAMES; i++) {
193
		if (len != token_names[i].len)
194
			continue;
195
		if (!strncmp(buf, token_names[i].str, len)) {
196
			emit(i);
197
			return len;
198
		}
199
	}
200
	syntax(cur_line, "invalid keyword '@%s'\n", memdup(buf, len));
201
}
202
203
static int emit_text(const char *buf, int size)
204
{
205
	struct token *tok;
206
	int i;
207
208
	for (i = 0; i < size; i++) {
209
		int c = buf[i];
210
		if (c == '@' || c == '`' || c == '*' || c == '\n' || c == '\\' || c == '\t')
211
			break;
212
	}
213
	tok = new_token(TOK_TEXT);
214
	tok->text = buf;
215
	tok->len = i;
216
	emit_token(tok);
217
	return i;
218
}
219
220
static void tokenize(const char *buf, int size)
221
{
222
	int pos = 0;
223
224
	while (pos < size) {
225
		struct token *tok;
226
		int ch;
227
228
		ch = buf[pos++];
229
		switch (ch) {
230
		case '@':
231
			pos += emit_keyword(buf + pos, size - pos);
232
			break;
233
		case '`':
234
			emit(TOK_ITALIC);
235
			break;
236
		case '*':
237
			emit(TOK_BOLD);
238
			break;
239
		case '\n':
240
			emit(TOK_NL);
241
			cur_line++;
242
			break;
243
		case '\t':
244
			emit(TOK_INDENT);
245
			break;
246
		case '\\':
247
			tok = new_token(TOK_TEXT);
248
			tok->text = buf + pos;
249
			tok->len = 1;
250
			pos++;
251
			if (pos == size || buf[pos] == '\n') {
252
				// just one '\\'
253
				tok->text--;
254
			}
255
256
			if (tok->text[0] == '\\') {
257
				tok->text = "\\\\";
258
				tok->len = 2;
259
			}
260
261
			emit_token(tok);
262
			break;
263
		default:
264
			pos--;
265
			pos += emit_text(buf + pos, size - pos);
266
			break;
267
		}
268
	}
269
}
270
271
static int is_empty_line(const struct token *tok)
272
{
273
	while (tok != &head) {
274
		int i;
275
276
		switch (tok->type) {
277
		case TOK_TEXT:
278
			for (i = 0; i < tok->len; i++) {
279
				if (tok->text[i] != ' ')
280
					return 0;
281
			}
282
			break;
283
		case TOK_INDENT:
284
			break;
285
		case TOK_NL:
286
			return 1;
287
		default:
288
			return 0;
289
		}
290
		tok = tok->next;
291
	}
292
	return 1;
293
}
294
295
static struct token *remove_line(struct token *tok)
296
{
297
	while (tok != &head) {
298
		struct token *next = tok->next;
299
		int type = tok->type;
300
301
		free_token(tok);
302
		tok = next;
303
		if (type == TOK_NL)
304
			break;
305
	}
306
	return tok;
307
}
308
309
static struct token *skip_after(struct token *tok, int type)
310
{
311
	struct token *save = tok;
312
313
	while (tok != &head) {
314
		if (tok->type == type) {
315
			tok = tok->next;
316
			if (tok->type != TOK_NL)
317
				syntax(tok->line, "newline expected after @%s\n",
318
						keyword_name(type));
319
			return tok->next;
320
		}
321
		if (tok->type >= TOK_H1)
322
			syntax(tok->line, "keywords not allowed betweed @%s and @%s\n",
323
					keyword_name(type-1), keyword_name(type));
324
		tok = tok->next;
325
	}
326
	syntax(save->prev->line, "missing @%s\n", keyword_name(type));
327
}
328
329
static struct token *get_next_line(struct token *tok)
330
{
331
	while (tok != &head) {
332
		int type = tok->type;
333
334
		tok = tok->next;
335
		if (type == TOK_NL)
336
			break;
337
	}
338
	return tok;
339
}
340
341
static struct token *get_indent(struct token *tok, int *ip)
342
{
343
	int i = 0;
344
345
	while (tok != &head && tok->type == TOK_INDENT) {
346
		tok = tok->next;
347
		i++;
348
	}
349
	*ip = i;
350
	return tok;
351
}
352
353
// line must be non-empty
354
static struct token *check_line(struct token *tok, int *ip)
355
{
356
	struct token *start;
357
	int tok_type;
358
359
	start = tok = get_indent(tok, ip);
360
361
	tok_type = tok->type;
362
	switch (tok_type) {
363
	case TOK_TEXT:
364
	case TOK_BOLD:
365
	case TOK_ITALIC:
366
	case TOK_BR:
367
		tok = tok->next;
368
		while (tok != &head) {
369
			switch (tok->type) {
370
			case TOK_TEXT:
371
			case TOK_BOLD:
372
			case TOK_ITALIC:
373
			case TOK_BR:
374
			case TOK_INDENT:
375
				break;
376
			case TOK_NL:
377
				return start;
378
			default:
379
				syntax(tok->line, "@%s not allowed inside paragraph\n",
380
						keyword_name(tok->type));
381
			}
382
			tok = tok->next;
383
		}
384
		break;
385
	case TOK_H1:
386
	case TOK_H2:
387
	case TOK_TITLE:
388
		if (*ip)
389
			goto indentation;
390
391
		// check arguments
392
		tok = tok->next;
393
		while (tok != &head) {
394
			switch (tok->type) {
395
			case TOK_TEXT:
396
			case TOK_INDENT:
397
				break;
398
			case TOK_NL:
399
				return start;
400
			default:
401
				syntax(tok->line, "@%s can contain only text\n",
402
						keyword_name(tok_type));
403
			}
404
			tok = tok->next;
405
		}
406
		break;
407
	case TOK_LI:
408
		// check arguments
409
		tok = tok->next;
410
		while (tok != &head) {
411
			switch (tok->type) {
412
			case TOK_TEXT:
413
			case TOK_BOLD:
414
			case TOK_ITALIC:
415
			case TOK_INDENT:
416
				break;
417
			case TOK_NL:
418
				return start;
419
			default:
420
				syntax(tok->line, "@%s not allowed inside @li\n",
421
						keyword_name(tok->type));
422
			}
423
			tok = tok->next;
424
		}
425
		break;
426
	case TOK_PRE:
427
		// checked later
428
		break;
429
	case TOK_RAW:
430
		if (*ip)
431
			goto indentation;
432
		// checked later
433
		break;
434
	case TOK_ENDPRE:
435
	case TOK_ENDRAW:
436
		syntax(tok->line, "@%s not expected\n", keyword_name(tok->type));
437
		break;
438
	case TOK_NL:
439
	case TOK_INDENT:
440
		BUG();
441
		break;
442
	}
443
	return start;
444
indentation:
445
	syntax(tok->line, "indentation before @%s\n", keyword_name(tok->type));
446
}
447
448
static void insert_nl_before(struct token *next)
449
{
450
	struct token *prev = next->prev;
451
	struct token *new = new_token(TOK_NL);
452
453
	new->prev = prev;
454
	new->next = next;
455
	prev->next = new;
456
	next->prev = new;
457
}
458
459
static void normalize(void)
460
{
461
	struct token *tok = head.next;
462
	/*
463
	 * >= 0 if previous line was text (== amount of indent)
464
	 *   -1 if previous block was @pre (amount of indent doesn't matter)
465
	 *   -2 otherwise (@h1 etc., indent was 0)
466
	 */
467
	int prev_indent = -2;
468
469
	while (tok != &head) {
470
		struct token *start;
471
		int i, new_para = 0;
472
473
		// remove empty lines
474
		while (is_empty_line(tok)) {
475
			tok = remove_line(tok);
476
			new_para = 1;
477
			if (tok == &head)
478
				return;
479
		}
480
481
		// skips indent
482
		start = tok;
483
		tok = check_line(tok, &i);
484
485
		switch (tok->type) {
486
		case TOK_TEXT:
487
		case TOK_ITALIC:
488
		case TOK_BOLD:
489
		case TOK_BR:
490
			// normal text
491
			if (new_para && prev_indent >= -1) {
492
				// previous line/block was text or @pre
493
				// and there was a empty line after it
494
				insert_nl_before(start);
495
			}
496
497
			if (!new_para && prev_indent == i) {
498
				// join with previous line
499
				struct token *nl = start->prev;
500
501
				if (nl->type != TOK_NL)
502
					BUG();
503
504
				if ((nl->prev != &head && nl->prev->type == TOK_BR) ||
505
						tok->type == TOK_BR) {
506
					// don't convert \n after/before @br to ' '
507
					free_token(nl);
508
				} else {
509
					// convert "\n" to " "
510
					nl->type = TOK_TEXT;
511
					nl->text = " ";
512
					nl->len = 1;
513
				}
514
515
				// remove indent
516
				while (start->type == TOK_INDENT) {
517
					struct token *next = start->next;
518
					free_token(start);
519
					start = next;
520
				}
521
			}
522
523
			prev_indent = i;
524
			tok = get_next_line(tok);
525
			break;
526
		case TOK_PRE:
527
		case TOK_RAW:
528
			// these can be directly after normal text
529
			// but not joined with the previous line
530
			if (new_para && prev_indent >= -1) {
531
				// previous line/block was text or @pre
532
				// and there was a empty line after it
533
				insert_nl_before(start);
534
			}
535
			tok = skip_after(tok->next, tok->type + 1);
536
			prev_indent = -1;
537
			break;
538
		case TOK_H1:
539
		case TOK_H2:
540
		case TOK_LI:
541
		case TOK_TITLE:
542
			// remove white space after H1, H2, L1 and TITLE
543
			tok = tok->next;
544
			while (tok != &head) {
545
				int type = tok->type;
546
				struct token *next;
547
548
				if (type == TOK_TEXT) {
549
					while (tok->len && *tok->text == ' ') {
550
						tok->text++;
551
						tok->len--;
552
					}
553
					if (tok->len)
554
						break;
555
				}
556
				if (type != TOK_INDENT)
557
					break;
558
559
				// empty TOK_TEXT or TOK_INDENT
560
				next = tok->next;
561
				free_token(tok);
562
				tok = next;
563
			}
564
			// not normal text. can't be joined
565
			prev_indent = -2;
566
			tok = get_next_line(tok);
567
			break;
568
		case TOK_NL:
569
		case TOK_INDENT:
570
		case TOK_ENDPRE:
571
		case TOK_ENDRAW:
572
			BUG();
573
			break;
574
		}
575
	}
576
}
577
578
#define output(...) fprintf(outfile, __VA_ARGS__)
579
580
static void output_buf(const char *buf, int len)
581
{
582
	fwrite(buf, 1, len, outfile);
583
}
584
585
static void output_text(struct token *tok)
586
{
587
	char buf[1024];
588
	const char *str = tok->text;
589
	int len = tok->len;
590
	int pos = 0;
591
592
	while (len) {
593
		int c = *str++;
594
595
		if (pos >= sizeof(buf) - 1) {
596
			output_buf(buf, pos);
597
			pos = 0;
598
		}
599
		if (c == '-')
600
			buf[pos++] = '\\';
601
		buf[pos++] = c;
602
		len--;
603
	}
604
605
	if (pos)
606
		output_buf(buf, pos);
607
}
608
609
/* state of the output stage; all start at zero */
static int bold;	/* flipped by every bold marker in a paragraph */
static int italic;	/* flipped by every italic marker in a paragraph */
static int indent;	/* number of .RS levels currently open */
612
613
static struct token *output_pre(struct token *tok)
614
{
615
	int bol = 1;
616
617
	if (tok->type != TOK_NL)
618
		syntax(tok->line, "newline expected after @pre\n");
619
620
	output(".nf\n");
621
	tok = tok->next;
622
	while (tok != &head) {
623
		if (bol) {
624
			int i;
625
626
			tok = get_indent(tok, &i);
627
			if (i != indent && tok->type != TOK_NL)
628
				syntax(tok->line, "indent changed in @pre\n");
629
		}
630
631
		switch (tok->type) {
632
		case TOK_TEXT:
633
			if (bol && tok->len && tok->text[0] == '.')
634
				output("\\&");
635
			output_text(tok);
636
			break;
637
		case TOK_NL:
638
			output("\n");
639
			bol = 1;
640
			tok = tok->next;
641
			continue;
642
		case TOK_ITALIC:
643
			output("`");
644
			break;
645
		case TOK_BOLD:
646
			output("*");
647
			break;
648
		case TOK_INDENT:
649
			// FIXME: warn
650
			output(" ");
651
			break;
652
		case TOK_ENDPRE:
653
			output(".fi\n");
654
			tok = tok->next;
655
			if (tok != &head && tok->type == TOK_NL)
656
				tok = tok->next;
657
			return tok;
658
		default:
659
			BUG();
660
			break;
661
		}
662
		bol = 0;
663
		tok = tok->next;
664
	}
665
	return tok;
666
}
667
668
static struct token *output_raw(struct token *tok)
669
{
670
	if (tok->type != TOK_NL)
671
		syntax(tok->line, "newline expected after @raw\n");
672
673
	tok = tok->next;
674
	while (tok != &head) {
675
		switch (tok->type) {
676
		case TOK_TEXT:
677
			if (tok->len == 2 && !strncmp(tok->text, "\\\\", 2)) {
678
				/* ugly special case
679
				 * "\\" (\) was converted to "\\\\" (\\) because
680
				 * nroff does escaping too.
681
				 */
682
				output("\\");
683
			} else {
684
				output_buf(tok->text, tok->len);
685
			}
686
			break;
687
		case TOK_NL:
688
			output("\n");
689
			break;
690
		case TOK_ITALIC:
691
			output("`");
692
			break;
693
		case TOK_BOLD:
694
			output("*");
695
			break;
696
		case TOK_INDENT:
697
			output("\t");
698
			break;
699
		case TOK_ENDRAW:
700
			tok = tok->next;
701
			if (tok != &head && tok->type == TOK_NL)
702
				tok = tok->next;
703
			return tok;
704
		default:
705
			BUG();
706
			break;
707
		}
708
		tok = tok->next;
709
	}
710
	return tok;
711
}
712
713
static struct token *output_para(struct token *tok)
714
{
715
	int bol = 1;
716
717
	while (tok != &head) {
718
		switch (tok->type) {
719
		case TOK_TEXT:
720
			output_text(tok);
721
			break;
722
		case TOK_ITALIC:
723
			italic ^= 1;
724
			if (italic) {
725
				output("\\fI");
726
			} else {
727
				output("\\fR");
728
			}
729
			break;
730
		case TOK_BOLD:
731
			bold ^= 1;
732
			if (bold) {
733
				output("\\fB");
734
			} else {
735
				output("\\fR");
736
			}
737
			break;
738
		case TOK_BR:
739
			if (bol) {
740
				output(".br\n");
741
			} else {
742
				output("\n.br\n");
743
			}
744
			bol = 1;
745
			tok = tok->next;
746
			continue;
747
		case TOK_NL:
748
			output("\n");
749
			return tok->next;
750
		case TOK_INDENT:
751
			output(" ");
752
			break;
753
		default:
754
			BUG();
755
			break;
756
		}
757
		bol = 0;
758
		tok = tok->next;
759
	}
760
	return tok;
761
}
762
763
static struct token *title(struct token *tok, const char *cmd)
764
{
765
	output("%s", cmd);
766
	return output_para(tok->next);
767
}
768
769
static struct token *dump_one(struct token *tok)
770
{
771
	int i;
772
773
	tok = get_indent(tok, &i);
774
	if (tok->type != TOK_RAW) {
775
		while (indent < i) {
776
			output(".RS\n");
777
			indent++;
778
		}
779
		while (indent > i) {
780
			output(".RE\n");
781
			indent--;
782
		}
783
	}
784
785
	switch (tok->type) {
786
	case TOK_TEXT:
787
	case TOK_ITALIC:
788
	case TOK_BOLD:
789
	case TOK_BR:
790
		if (tok->type == TOK_TEXT && tok->len && tok->text[0] == '.')
791
			output("\\&");
792
		tok = output_para(tok);
793
		break;
794
	case TOK_H1:
795
		tok = title(tok, ".SH ");
796
		break;
797
	case TOK_H2:
798
		tok = title(tok, ".SS ");
799
		break;
800
	case TOK_LI:
801
		tok = title(tok, ".TP\n");
802
		break;
803
	case TOK_PRE:
804
		tok = output_pre(tok->next);
805
		break;
806
	case TOK_RAW:
807
		tok = output_raw(tok->next);
808
		break;
809
	case TOK_TITLE:
810
		tok = title(tok, ".TH ");
811
		// must be after .TH
812
		// no hyphenation, adjust left
813
		output(".nh\n.ad l\n");
814
		break;
815
	case TOK_NL:
816
		output("\n");
817
		tok = tok->next;
818
		break;
819
	case TOK_ENDPRE:
820
	case TOK_ENDRAW:
821
	case TOK_INDENT:
822
		BUG();
823
		break;
824
	}
825
	return tok;
826
}
827
828
static void dump(void)
829
{
830
	struct token *tok = head.next;
831
832
	while (tok != &head)
833
		tok = dump_one(tok);
834
}
835
836
static void process(void)
837
{
838
	struct stat s;
839
	const char *buf;
840
	int fd;
841
842
	fd = open(filename, O_RDONLY);
843
	if (fd == -1)
844
		die("opening `%s' for reading: %s\n", filename, strerror(errno));
845
	fstat(fd, &s);
846
	if (s.st_size) {
847
		buf = mmap(NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
848
		if (buf == MAP_FAILED)
849
			die("mmap: %s\n", strerror(errno));
850
851
		tokenize(buf, s.st_size);
852
		normalize();
853
	}
854
	dump();
855
}
856
857
int main(int argc, char *argv[])
858
{
859
	const char *dest;
860
	int fd;
861
862
	program = argv[0];
863
	if (argc != 3) {
864
		fprintf(stderr, "Usage: %s <in> <out>\n", program);
865
		return 1;
866
	}
867
	filename = argv[1];
868
	dest = argv[2];
869
870
	snprintf(tmp_file, sizeof(tmp_file), "%s.XXXXXX", dest);
871
	fd = mkstemp(tmp_file);
872
	if (fd < 0)
873
		die("creating %s: %s\n", tmp_file, strerror(errno));
874
	outfile = fdopen(fd, "w");
875
	if (!outfile)
876
		die("opening %s: %s\n", tmp_file, strerror(errno));
877
878
	process();
879
	if (rename(tmp_file, dest))
880
		die("renaming %s to %s: %s\n", tmp_file, dest, strerror(errno));
881
	return 0;
882
}