Added workaround for broken ifstream::putback() on MSVC
[hypercube:hypercube.git] / IO / providers / graphml.cpp
1 #include <cstring>
2 #include <cerrno>
3 #include "IO/modules.h"
4 #include "IO/encodings/utf8cvt.h"
5 #include "graphml.h"
6
7 using namespace std;
8
9
/* True if c may begin a name as recognised by the tokenizer:
   an alphanumeric character, '_' or ':'. Note that c is evaluated
   more than once. */
#define isStartName(c) \
        ((c) == '_' || (c) == ':' || isalnum((c)))
/* True if c may continue a name: any start character, '-' or '.'. */
#define isName(c) \
        ((c) == '-' || (c) == '.' || isStartName((c)))
14
15
16 #define NUM_RELATIONS (sizeof(relations) / sizeof(Relation))
17 const GraphmlGraphInput::Relation GraphmlGraphInput::relations[] = {
18         {L"graphml", L""},
19         {L"graph", L"graphml"},
20         {L"node", L"graph"},
21         {L"edge", L"graph"}
22 };
23
24
/*
 * Widens a narrow string to a wide string, one character per byte.
 *
 * Each byte is converted through unsigned char so that bytes with the high
 * bit set map to the code points 0x80..0xFF instead of being sign-extended
 * on platforms where plain char is signed.
 */
static std::wstring s2w(const string &s)
{
        wstring w;

        w.reserve(s.length());
        for (string::const_iterator it = s.begin(); it != s.end(); ++it)
                w += static_cast<wchar_t>(static_cast<unsigned char>(*it));

        return w;
}
32
33
34 Vertex* GraphmlGraphInput::addVertex(const wstring &id)
35 {
36         Vertex *v;
37         map<wstring, Vertex*>::const_iterator it;
38
39         it = _vertexes.find(id);
40         if (it != _vertexes.end())
41                 return it->second;
42
43         v = _graph->addVertex();
44
45         _vertexes.insert(pair<wstring, Vertex*>(id, v));
46
47         return v;
48 }
49
50 Edge* GraphmlGraphInput::addEdge(const wstring &source, const wstring &target)
51 {
52         Vertex *src, *dst;
53
54         src = addVertex(source);
55         dst = addVertex(target);
56
57         return _graph->addEdge(src, dst);
58 }
59
60 void GraphmlGraphInput::checkRelation(const wstring &node, const wstring &parent)
61 {
62         for (size_t i = 0; i < NUM_RELATIONS; i++) {
63                 if (node == relations[i].node) {
64                         if (parent != relations[i].parent)
65                                 error();
66                         return;
67                 }
68         }
69 }
70
71 void GraphmlGraphInput::setEncoding(const wstring &encoding)
72 {
73         codecvt<wchar_t,char,mbstate_t> *cvt = 0;
74
75         for (Encoding **ep = encodings; *ep; ep++) {
76                 if (stringCaseCmp(encoding, s2w((*ep)->name()))) {
77                         cvt = (*ep)->cvt();
78                         break;
79                 }
80         }
81         if (!cvt) {
82                 cerr << "Unsupported encoding. Using UTF-8." << endl;
83                 cvt = new utf8cvt;
84         }
85
86         locale lc(std::locale(), cvt);
87         _fs.imbue(lc);
88 }
89
90 void GraphmlGraphInput::setAttribute(const wstring &attr, const wstring &value)
91 {
92         if (attr == L"id")
93                 _attributes.id = value;
94         if (attr == L"source")
95                 _attributes.source = value;
96         if (attr == L"target")
97                 _attributes.target = value;
98         if (attr == L"encoding")
99                 _attributes.encoding = value;
100 }
101
102 void GraphmlGraphInput::clearAttributes()
103 {
104         _attributes.id.clear();
105         _attributes.source.clear();
106         _attributes.target.clear();
107 }
108
109 void GraphmlGraphInput::handleElement(const wstring &element)
110 {
111         Vertex *vertex;
112         Edge *edge;
113
114         if (element == L"node") {
115                 if (_attributes.id.empty()) {
116                         error();
117                         return;
118                 }
119                 vertex = addVertex(_attributes.id);
120                 vertex->setText(_attributes.id);
121         }
122         if (element == L"edge") {
123                 if (_attributes.source.empty() || _attributes.target.empty()) {
124                         error();
125                         return;
126                 }
127                 edge = addEdge(_attributes.source, _attributes.target);
128                 if (!_attributes.id.empty())
129                         edge->setText(_attributes.id);
130         }
131 }
132
133
134 void GraphmlGraphInput::error()
135 {
136         if (_token == ERROR)
137                 return;
138
139         ioerr << "GraphML: parse error on line: " << _line << endl;
140         _token = ERROR;
141 }
142
143 void GraphmlGraphInput::nextToken()
144 {
145         int c, state = 0;
146
147
148         while (1) {
149                 c = _fs.get();
150
151                 switch (state) {
152                         case 0:
153                                 if (isspace(c)) {
154                                         if (c == '\n')
155                                                 _line++;
156                                         break;
157                                 }
158                                 if (c == '<') {
159                                         _token = LT;
160                                         return;
161                                 }
162                                 if (c == '>') {
163                                         _token = GT;
164                                         return;
165                                 }
166                                 if (c == '[') {
167                                         _token = LSB;
168                                         return;
169                                 }
170                                 if (c == ']') {
171                                         _token = RSB;
172                                         return;
173                                 }
174                                 if (c == '=') {
175                                         _token = EQ;
176                                         return;
177                                 }
178                                 if (c == '&') {
179                                         _token = AMP;
180                                         return;
181                                 }
182                                 if (c == '/') {
183                                         _token = SLASH;
184                                         return;
185                                 }
186                                 if (c == '?') {
187                                         _token = QM;
188                                         return;
189                                 }
190                                 if (c == '!') {
191                                         _token = EXCL;
192                                         return;
193                                 }
194                                 if (c == '-') {
195                                         _token = MINUS;
196                                         return;
197                                 }
198                                 if (isStartName(c)) {
199                                         _string = c;
200                                         state = 1;
201                                         break;
202                                 }
203                                 if (c == '"') {
204                                         _string.clear();
205                                         state = 2;
206                                         break;
207                                 }
208                                 if (c == '\'') {
209                                         _string.clear();
210                                         state = 3;
211                                         break;
212                                 }
213                                 if (c == -1) {
214                                         _token = EOI;
215                                         return;
216                                 }
217                                 _token = DATA;
218                                 return;
219
220                         case 1:
221                                 if (isName(c)) {
222                                         _string += c;
223                                         break;
224                                 }
225                                 _fs.unget();
226                                 _token = IDENT;
227                                 return;
228
229                         case 2:
230                                 if (c == '"') {
231                                         _token = STRING;
232                                         return;
233                                 }
234                                 if (c == '\n')
235                                         _line++;
236                                 _string += c;
237                                 break;
238
239                         case 3:
240                                 if (c == '\'') {
241                                         _token = STRING;
242                                         return;
243                                 }
244                                 if (c == '\n')
245                                         _line++;
246                                 _string += c;
247                                 break;
248                 }
249         }
250 }
251
252 void GraphmlGraphInput::compare(Token token)
253 {
254         if (_token == token)
255                 nextToken();
256         else
257                 error();
258 }
259
260 void GraphmlGraphInput::data()
261 {
262         while (1) {
263                 switch (_token) {
264                         case LT:
265                         case ERROR:
266                                 return;
267                         case EOI:
268                                 error();
269                                 return;
270                         default:
271                                 nextToken();
272                 }
273         }
274 }
275
276 void GraphmlGraphInput::dtdElement()
277 {
278         while (1) {
279                 switch (_token) {
280                         case GT:
281                                 nextToken();
282                                 return;
283                         case ERROR:
284                                 return;
285                         case EOI:
286                                 error();
287                                 return;
288                         default:
289                                 nextToken();
290                 }
291         }
292 }
293
294 void GraphmlGraphInput::dtdElementType()
295 {
296         switch (_token) {
297                 case MINUS:
298                         comment();
299                         break;
300                 case IDENT:
301                         dtdElement();
302                         break;
303                 default:
304                         error();
305         }
306 }
307
308 void GraphmlGraphInput::dtdData()
309 {
310         while (1) {
311                 switch (_token) {
312                         case LT:
313                                 nextToken();
314                                 compare(EXCL);
315                                 dtdElementType();
316                                 break;
317                         case RSB:
318                                 nextToken();
319                                 return;
320                         default:
321                                 error();
322                                 return;
323                 }
324         }
325 }
326
327 void GraphmlGraphInput::cdataData()
328 {
329         while (1) {
330                 switch (_token) {
331                         case RSB:
332                                 nextToken();
333                                 if (_token == RSB) {
334                                         nextToken();
335                                         if (_token == GT)
336                                                 return;
337                                 }
338                                 break;
339                         case ERROR:
340                                 return;
341                         case EOI:
342                                 error();
343                                 return;
344                         default:
345                                 nextToken();
346                 }
347         }
348 }
349
350 void GraphmlGraphInput::commentData()
351 {
352         int c, state = 0;
353
354         while (1) {
355                 c = _fs.get();
356
357                 if (c == '\n')
358                         _line++;
359
360                 switch (state) {
361                         case 0:
362                                 if (c == -1) {
363                                         error();
364                                         return;
365                                 }
366                                 if (c == '-')
367                                         state = 1;
368                                 break;
369
370                         case 1:
371                                 if (c == -1) {
372                                         error();
373                                         return;
374                                 }
375                                 if (c == '-')
376                                         state = 2;
377                                 else
378                                         state = 0;
379                                 break;
380
381                         case 2:
382                                 if (c == '>') {
383                                         _token = GT;
384                                         return;
385                                 }
386                                 error();
387                                 return;
388                 }
389         }
390 }
391
392 void GraphmlGraphInput::comment()
393 {
394         compare(MINUS);
395         if (_token != MINUS) {
396                 error();
397                 return;
398         }
399         commentData();
400         compare(GT);
401 }
402
403 void GraphmlGraphInput::cdata()
404 {
405         compare(LSB);
406         compare(IDENT);
407         compare(LSB);
408         cdataData();
409         compare(GT);
410 }
411
412 void GraphmlGraphInput::special()
413 {
414         switch (_token) {
415                 case MINUS:
416                         comment();
417                         break;
418                 case LSB:
419                         cdata();
420                         break;
421                 default:
422                         error();
423         }
424 }
425
426 void GraphmlGraphInput::attribute()
427 {
428         wstring attr, value;
429
430         attr = _string;
431         compare(IDENT);
432         compare(EQ);
433         value = _string;
434         compare(STRING);
435
436         if (_token == ERROR)
437                 return;
438
439         setAttribute(attr, value);
440 }
441
442 void GraphmlGraphInput::xmlAttributes()
443 {
444         while (1) {
445                 switch (_token) {
446                         case QM:
447                         case ERROR:
448                                 return;
449                         default:
450                                 attribute();
451                 }
452         }
453 }
454
455 bool GraphmlGraphInput::attributes()
456 {
457         clearAttributes();
458
459         while (1) {
460                 switch (_token) {
461                         case SLASH:
462                                 nextToken();
463                                 return true;
464                         case GT:
465                                 return false;
466                         case IDENT:
467                                 attribute();
468                                 break;
469                         default:
470                                 error();
471                                 return false;
472                 }
473         }
474 }
475
476 void GraphmlGraphInput::elementType(const wstring &parent)
477 {
478         switch (_token) {
479                 case SLASH:
480                         break;
481                 case IDENT:
482                         element(parent);
483                         break;
484                 case EXCL:
485                         nextToken();
486                         special();
487                         break;
488                 default:
489                         error();
490         }
491 }
492
493 void GraphmlGraphInput::nextItem(const wstring &parent)
494 {
495         switch (_token) {
496                 case LT:
497                         nextToken();
498                         elementType(parent);
499                         break;
500                 case EOI:
501                 case ERROR:
502                         break;
503                 default:
504                         data();
505                         compare(LT);
506                         break;
507         }
508 }
509
510 void GraphmlGraphInput::element(const wstring &parent)
511 {
512         bool closed;
513         wstring start, end;
514
515
516         start = _string;
517         compare(IDENT);
518         checkRelation(start, parent);
519         closed = attributes();
520         compare(GT);
521         if (_token == ERROR)
522                 return;
523
524         handleElement(start);
525
526         if (closed)
527                 return;
528
529         while (_token != ERROR && _token != SLASH)
530                 nextItem(start);
531
532         compare(SLASH);
533         end = _string;
534         compare(IDENT);
535         compare(GT);
536
537         if (start != end)
538                 error();
539 }
540
541 void GraphmlGraphInput::dtdMarkupDecl()
542 {
543         switch (_token) {
544                 case LSB:
545                         nextToken();
546                         dtdData();
547                         break;
548                 case GT:
549                         return;
550                 default:
551                         error();
552         }
553 }
554
555 void GraphmlGraphInput::dtdExternalId()
556 {
557         wstring type = _string;
558
559         switch (_token) {
560                 case LSB:
561                         break;
562                 case IDENT:
563                         nextToken();
564                         if (type == L"PUBLIC") {
565                                 compare(STRING);
566                                 compare(STRING);
567                         } else if (type == L"SYSTEM")
568                                 compare(STRING);
569                         else
570                                 error();
571                         break;
572                 default:
573                         error();
574         }
575 }
576
577 void GraphmlGraphInput::dtdDecl()
578 {
579         if (_string != L"DOCTYPE")
580                 error();
581         compare(IDENT);
582         compare(IDENT);
583         dtdExternalId();
584         dtdMarkupDecl();
585         compare(GT);
586 }
587
588 void GraphmlGraphInput::xmlDecl()
589 {
590         compare(QM);
591         if (!stringCaseCmp(_string, L"xml"))
592                 error();
593         compare(IDENT);
594         xmlAttributes();
595         compare(QM);
596         compare(GT);
597 }
598
599 void GraphmlGraphInput::prologComment()
600 {
601         switch (_token) {
602                 case EXCL:
603                         nextToken();
604                         comment();
605                         compare(LT);
606                         prologComment();
607                         break;
608                 case IDENT:
609                         break;
610                 default:
611                         error();
612         }
613 }
614
615 void GraphmlGraphInput::prologSpecial()
616 {
617         switch (_token) {
618                 case MINUS:
619                         comment();
620                         compare(LT);
621                         prologContent();
622                         break;
623                 case IDENT:
624                         dtdDecl();
625                         compare(LT);
626                         prologComment();
627                         break;
628                 default:
629                         error();
630         }
631 }
632
633 void GraphmlGraphInput::prologContent()
634 {
635         switch (_token) {
636                 case EXCL:
637                         nextToken();
638                         prologSpecial();
639                         break;
640                 case IDENT:
641                         break;
642                 default:
643                         error();
644         }
645 }
646
647 void GraphmlGraphInput::xml()
648 {
649         compare(LT);
650
651         switch (_token) {
652                 case QM:
653                         xmlDecl();
654                         compare(LT);
655                         prologContent();
656                         break;
657                 case EXCL:
658                         nextToken();
659                         prologSpecial();
660                         break;
661                 case IDENT:
662                         break;
663                 default:
664                         error();
665         }
666
667         setEncoding(_attributes.encoding);
668         element(L"");
669 }
670
671 bool GraphmlGraphInput::parse()
672 {
673         _line = 1;
674         _token = START;
675         _attributes.encoding = L"utf-8";
676
677         nextToken();
678         xml();
679
680         _vertexes.clear();
681
682         if (_token == EOI)
683                 return true;
684         else {
685                 error();
686                 return false;
687         }
688 }
689
690 IO::Error GraphmlGraphInput::readGraph(Graph *graph, const char *fileName,
691   Encoding *encoding)
692 {
693         IO::Error err = Ok;
694
695         _graph = graph;
696
697         _fs.open(fileName);
698         if (!_fs) {
699                 ioerr << fileName << ": " << strerror(errno) << endl;
700                 err = OpenError;
701         } else {
702                 if (!parse())
703                         err = (_fs.fail()) ? ReadError : FormatError;
704         }
705
706         _fs.close();
707         _fs.clear();
708
709         if (err)
710                 _graph->clear();
711
712         return err;
713 }