Commit 932f6f6e9d978d9c21198b0bc3f41baf968990f1

Add strigi utils.
  
1010
1111add_subdirectory(libstreams)
1212add_subdirectory(libstreamanalyzer)
13add_subdirectory(strigiutils)
1314add_subdirectory(strigidaemon)
1415add_subdirectory(strigiclient)
  
# Build configuration of the strigiutils package.
# The package is configured either stand-alone (libstreams and
# libstreamanalyzer are located with find_package) or as part of the
# meta-package, in which case STRIGI_VERSION_STRING is already defined.

##### cmake settings #####

# cmake_minimum_required() has to run before project() so that the policy
# defaults are set up before the project and its languages are initialised.
cmake_minimum_required(VERSION 2.6)
project (strigiutils)

set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
#include(MacroCheckGccVisibility)
#include(MacroFindOptionalDep)
enable_testing()


##### global variables #####


##### environment inspection #####

# check for required packages
if(STRIGI_VERSION_STRING)
  # if STRIGI_VERSION_STRING is defined, we are compiling the meta-package
  # The paths are made absolute so the variables keep their meaning when they
  # are used from a different directory than this one.
  set(LIBSTREAMS_INCLUDES
    ${CMAKE_CURRENT_SOURCE_DIR}/../libstreams/include
    ${CMAKE_CURRENT_BINARY_DIR}/../libstreams/include)
  set(LIBSTREAMANALYZER_INCLUDES
    ${CMAKE_CURRENT_SOURCE_DIR}/../libstreamanalyzer/include
    ${CMAKE_CURRENT_BINARY_DIR}/../libstreamanalyzer/include)
else()
  find_package(libstreams REQUIRED)
  find_package(libstreamanalyzer REQUIRED)
endif()

##### building and testing #####
include_directories(${LIBSTREAMS_INCLUDES})
include_directories(${LIBSTREAMANALYZER_INCLUDES})

add_subdirectory(bin)
  
# Each sub-directory listed here is added to the build with its own
# CMakeLists.txt.
add_subdirectory( deepfind )
add_subdirectory( xmlindexer )
  
# Build rules for the programs and helper libraries in this directory.

# When HAVE_REGEX_H is not set, regex.c from the source tree is compiled into
# the grep helper library and its directory is put on the include path.
if(NOT HAVE_REGEX_H)
  set(REGEX_INCLUDE_DIR ${strigi_SOURCE_DIR}/src/streams/strigi/regex)
  set(REGEX_SOURCES ${REGEX_INCLUDE_DIR}/regex.c)
  # TODO: install copyright file !
endif()

include_directories(
  ../streamanalyzer
  ../streams
  ${REGEX_INCLUDE_DIR}
  ${strigi_BINARY_DIR}/src/streams
)

# Targets that are always built (none of them is installed).
add_executable(dummyindexer dummyindexer.cpp)
target_link_libraries(dummyindexer streamanalyzer)

add_library(grepindexer STATIC
  grepindexreader.cpp
  grepindexmanager.cpp
  grepindexwriter.cpp
  ${REGEX_SOURCES})

add_executable(greptest grepindexreader.cpp)
target_link_libraries(greptest streamanalyzer)

add_library(grepindex STATIC grepindexmanager.cpp)

# deepfind and deepgrep are the only installed programs; both are optional.
if(BUILD_DEEPTOOLS)
  add_executable(deepfind deepfind.cpp)
  target_link_libraries(deepfind streamanalyzer)
  install(TARGETS deepfind RUNTIME DESTINATION bin)

  add_executable(deepgrep deepgrep.cpp)
  target_link_libraries(deepgrep grepindexer streamanalyzer ${REGEX_LIBRARIES})
  install(TARGETS deepgrep RUNTIME DESTINATION bin)
endif()

# The latency tester is skipped for MSVC builds.
if(NOT MSVC)
  add_executable(analyzerlatencytester analyzerlatencytester.cpp)
  target_link_libraries(analyzerlatencytester streamanalyzer)
endif()
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2008 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20
21#include <strigi/strigiconfig.h>
22#include <strigi/indexmanager.h>
23#include <strigi/indexwriter.h>
24#include <strigi/diranalyzer.h>
25#include <strigi/analyzerconfiguration.h>
26#include <iostream>
27#include <map>
28#include <cassert>
29#include <cmath>
30#include <climits>
31#include <sys/time.h>
32#include <time.h>
33
34using namespace Strigi;
35using namespace std;
36
/**
 * Return the time that passed between b and a (a - b) in seconds.
 * The seconds and microseconds differences are converted to float
 * separately before they are combined.
 **/
float
elapsed(const struct timeval& a, const struct timeval& b) {
    const float wholeSeconds = (float)(a.tv_sec - b.tv_sec);
    const float microSeconds = (float)(a.tv_usec - b.tv_usec);
    return wholeSeconds + microSeconds / 1.0e6f;
}
42
43class LatencyMeasurer : public AnalyzerConfiguration {
44private:
45 class Private;
46 Private* const d;
47
48 // We implement this function so we can count the number of files analyzed.
49 bool indexFile(const char* path, const char* filename) const;
50 // This is the function that analyzers should call often to reduce latency.
51 // Here we always let the measurement continue so we can measure more.
52 bool indexMore() const;
53public:
54 LatencyMeasurer();
55 ~LatencyMeasurer();
56 void printReport();
57};
58
59class LatencyMeasurer::Private {
60public:
61 struct timeval starttime, lasttime;
62 int32_t numberOfChecks;
63 long numberOfFiles;
64 map<int, int> histogram;
65 string beforeLastFile;
66 struct timeval beforeLastTime;
67 string lastFile;
68 struct timeval lastTime;
69 Private() :numberOfChecks(0), numberOfFiles(0) {
70 starttime.tv_sec = -1;
71 }
72 void print();
73};
74// We implement this function so we can count the number of files analyzed.
75bool
76LatencyMeasurer::indexFile(const char* path, const char* filename) const {
77 d->beforeLastFile.assign(d->lastFile);
78 d->beforeLastTime = d->lastTime;
79 d->lastFile.assign(path);
80 gettimeofday(&d->lastTime, NULL);
81 d->numberOfFiles++;
82 return true;
83}
84// This is the function that analyzers should call often to reduce latency.
85// Here we always let the measurement continue so we can measure more.
86bool
87LatencyMeasurer::indexMore() const {
88 d->numberOfChecks++;
89 struct timeval now;
90 gettimeofday(&now, NULL);
91 if (d->starttime.tv_sec == -1) {
92 d->lasttime = d->starttime = now;
93 }
94 d->histogram[static_cast<int>(10*log10(elapsed(now, d->lasttime)))]++;
95 if (elapsed(now, d->lasttime) > 1) {
96 cerr << d->beforeLastFile << " started "
97 << elapsed(now, d->beforeLastTime) << " seconds ago." << endl;
98 cerr << d->lastFile << " started "
99 << elapsed(now, d->lastTime) << " seconds ago." << endl;
100 assert(elapsed(now, d->lasttime) < 3);
101 }
102 d->lasttime = now;
103 return true;
104}
105LatencyMeasurer::LatencyMeasurer() :d(new Private()) {
106}
107LatencyMeasurer::~LatencyMeasurer() {
108 delete d;
109}
110void
111LatencyMeasurer::printReport() {
112 d->print();
113}
114void
115LatencyMeasurer::Private::print() {
116 struct timeval now;
117 gettimeofday(&now, NULL);
118 cout << numberOfChecks << " checks in " << numberOfFiles << " files."
119 << endl;
120 cout << "On average " << (elapsed(now, starttime)/(float)numberOfChecks)
121 << " seconds between checks." << endl;
122 int smallestTime = INT_MAX;
123 int largestTime = INT_MIN;
124 double total = 0;
125 for (map<int,int>::const_iterator i = histogram.begin();
126 i != histogram.end(); ++i) {
127 int n = i->first;
128 total += pow(10.0, 0.1*n) * histogram[n];
129 if (n > largestTime && n < 1000) largestTime = n;
130 if (n < smallestTime && n > -1000) smallestTime = n;
131 }
132 double sum = 0;
133 for (int n=smallestTime; n<=largestTime; ++n) {
134 sum += pow(10.0,0.1*n) * histogram[n]/total;
135 cout << pow(10.0,0.1*n) << '\t' << 1-sum << endl;
136 }
137}
138
139class DummyWriter : public IndexWriter {
140private:
141 void startAnalysis(const AnalysisResult*) {}
142 void addText(const AnalysisResult*, const char*, int32_t) {}
143 void addValue(const AnalysisResult*, const RegisteredField*, const string&) {}
144 void addValue(const AnalysisResult*, const RegisteredField*, const unsigned char*, uint32_t) {}
145 void addValue(const AnalysisResult*, const RegisteredField*, int32_t) {}
146 void addValue(const AnalysisResult*, const RegisteredField*, uint32_t) {}
147 void addValue(const AnalysisResult*, const RegisteredField*, double) {}
148 void addValue(const AnalysisResult*, const RegisteredField*, const string&, const string&) {}
149 void finishAnalysis(const AnalysisResult*) {}
150 void addTriplet(const string&, const string&, const string&) {}
151 void deleteEntries(const vector<string>&) {}
152 void deleteAllEntries() {}
153};
154
155class DummyManager : public IndexManager {
156private:
157 DummyWriter dummywriter;
158 IndexReader* indexReader() { return 0; }
159 IndexWriter* indexWriter() { return &dummywriter; }
160};
161
162int
163main(int argc, char** argv) {
164 if (argc == 1) {
165 cerr << argv[0]
166 << " is a tool for testing the latency of the analyzers." << endl;
167 cerr << "Provide a directory to test on." << endl;
168 return 1;
169 }
170
171 LatencyMeasurer measurer;
172 DummyManager manager;
173 DirAnalyzer analyzer(manager, measurer);
174 int nthreads = 1;
175 for (int32_t i=1; i<argc; ++i) {
176 analyzer.analyzeDir(argv[i], nthreads);
177 }
178
179 measurer.printReport();
180 return 0;
181}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include <strigi/strigiconfig.h>
21#include "dummyindexwriter.h"
22#include <strigi/streamanalyzer.h>
23#include <strigi/analyzerconfiguration.h>
24#include <strigi/streamendanalyzer.h>
25#include <strigi/diranalyzer.h>
26using namespace Strigi;
27using namespace std;
28
29/**
30 * Special indexer that indexes only the filenames.
31 **/
32class FindIndexerConfiguration : public AnalyzerConfiguration {
33public:
34 bool useFactory(StreamEndAnalyzerFactory* e) const {
35 return e->analyzesSubStreams();
36 }
37 bool useFactory(StreamThroughAnalyzerFactory*) const {return false;}
38 bool indexMore() const {return true;}
39 bool addMoreText() const {return false;}
40 FieldType indexType(const string& fieldname) const {
41 return None;
42 }
43};
44
// Write a one line usage message for the program named argv[0] to stderr.
void
printUsage(char** argv) {
    const char* const program = argv[0];
    fprintf(stderr, "Usage: %s [dir-or-file-to-find]\n", program);
}
// Return true when one of the arguments after argv[0] is "--help" or "-h".
bool
containsHelp(int argc, char **argv) {
    int position = 1;
    while (position < argc) {
        const char* const argument = argv[position];
        if (!strcmp(argument, "--help") || !strcmp(argument, "-h")) {
            return true;
        }
        ++position;
    }
    return false;
}
57
58int
59main(int argc, char **argv) {
60 const char* path = ".";
61 if (containsHelp(argc, argv) || argc > 2) {
62 printUsage(argv);
63 return -1;
64 }
65 if (argc == 2) {
66 path = argv[1];
67 }
68
69 DummyIndexManager manager(1);
70 FindIndexerConfiguration conf;
71 DirAnalyzer analyzer(manager, conf);
72 analyzer.analyzeDir(path, 1);
73 return 0;
74}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include <stdio.h>
21
22#include <strigi/strigiconfig.h>
23#include "grepindexmanager.h"
24#include <strigi/diranalyzer.h>
25#include <strigi/analyzerconfiguration.h>
26#include <iostream>
27#include <cstring>
28using namespace Strigi;
29using namespace std;
30
// Write the usage message for the program named argv[0] to stderr.
void
printUsage(char** argv) {
    static const char* const usage =
        "Usage: %s [--fields] [--help] PATTERN [dir-or-file-to-grep]\n"
        " --fields print the list of fields\n"
        " --help print this help screen\n";
    fprintf(stderr, usage, argv[0]);
}
/**
 * Return true when one of the arguments after argv[0] equals arg or, when a
 * is not null, equals a.
 **/
bool
containsArgument(int argc, char **argv, const char* arg, const char* a=0) {
    int position = 1;
    while (position < argc) {
        const char* const current = argv[position];
        if (strcmp(current, arg) == 0) return true;
        if (a != 0 && strcmp(current, a) == 0) return true;
        ++position;
    }
    return false;
}
// True when "--help" or "-h" was passed.
bool
containsHelp(int argc, char **argv) {
    const bool found = containsArgument(argc, argv, "--help", "-h");
    return found;
}
// True when "--fields" or "-f" was passed.
bool
containsFieldList(int argc, char **argv) {
    const bool found = containsArgument(argc, argv, "--fields", "-f");
    return found;
}
54
55void
56printFields(AnalyzerConfiguration& conf) {
57 const map<string, RegisteredField*>& fields
58 = conf.fieldRegister().fields();
59 map<string, RegisteredField*>::const_iterator i;
60 for (i = fields.begin(); i != fields.end(); ++i) {
61 cout << i->first << endl;
62 }
63}
64
65int
66main(int argc, char** argv) {
67 AnalyzerConfiguration ic;
68 if (containsFieldList(argc, argv)) {
69 printFields(ic);
70 return 0;
71 }
72 if (containsHelp(argc, argv) || argc < 2) {
73 printUsage(argv);
74 return -1;
75 }
76 GrepIndexManager manager(argv[1]);
77
78 DirAnalyzer analyzer(manager, ic);
79 int nthreads = 8;
80 if (argc > 2) {
81 for (int32_t i=2; i<argc; ++i) {
82 analyzer.analyzeDir(argv[i], nthreads);
83 }
84 } else {
85 analyzer.analyzeDir(".", nthreads);
86 }
87 return 0;
88}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include "dummyindexwriter.h"
21#include <strigi/strigiconfig.h>
22#include <strigi/diranalyzer.h>
23#include <strigi/analyzerconfiguration.h>
24
25#include <stdlib.h>
26
// Write a one line usage message for the program named argv[0] to stderr.
void
printUsage(char** argv) {
    const char* const program = argv[0];
    fprintf(stderr, "Usage: %s [-v verbosity] [dir-to-index]\n", program);
}
31
32int
33main(int argc, char **argv) {
34 if (argc != 2 && argc != 4) {
35 printUsage(argv);
36 return -1;
37 }
38 int verbosity = 0;
39 if (argc == 4) {
40 if (std::strcmp("-v", argv[1])) {
41 printUsage(argv);
42 return -1;
43 }
44 verbosity = atoi(argv[2]);
45 }
46
47 DummyIndexManager manager(verbosity);
48 Strigi::AnalyzerConfiguration ic;
49 Strigi::DirAnalyzer analyzer(manager, ic);
50 analyzer.analyzeDir(argv[argc-1]);
51 return 0;
52}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifndef DUMMYINDEXWRITER_H
21#define DUMMYINDEXWRITER_H
22
23#include <strigi/analysisresult.h>
24#include <strigi/indexwriter.h>
25#include <strigi/indexmanager.h>
26#include <strigi/fieldtypes.h>
27#include <cstring>
28
29class DummyIndexWriter : public Strigi::IndexWriter {
30private:
31 int verbosity;
32protected:
33 void startAnalysis(const Strigi::AnalysisResult* ar) {
34 if (verbosity >= 1) {
35 printf("%s\n", ar->path().c_str());
36 }
37 if (verbosity == -1) { // sha1 mode
38 std::string* s = new std::string();
39 ar->setWriterData(s);
40 }
41 }
42 void finishAnalysis(const Strigi::AnalysisResult* ar) {
43 if (verbosity == -1) { // sha1 mode
44 const std::string* s = static_cast<const std::string*>(
45 ar->writerData());
46 printf("%s\t%s\n", ar->path().c_str(), s->c_str());
47 delete s;
48 }
49 }
50 void addText(const Strigi::AnalysisResult* ar, const char* text,
51 int32_t length) {
52 if (verbosity > 2) {
53 printf("%s: addText '%.*s'\n", ar->path().c_str(), length,
54 text);
55 }
56 }
57 void addValue(const Strigi::AnalysisResult* ar,
58 const Strigi::RegisteredField* field, const std::string& value) {
59 if (verbosity > 1) {
60 printf("%s: setField '%s': '%s'\n", ar->path().c_str(),
61 field->key().c_str(), value.c_str());
62 } else if (verbosity == -1
63 && std::strcmp(field->key().c_str(), "sha1") == 0) {
64 std::string* s = static_cast<std::string*>(ar->writerData());
65 *s = value;
66 }
67 }
68 void addValue(const Strigi::AnalysisResult* ar,
69 const Strigi::RegisteredField* fieldname, const unsigned char* data,
70 uint32_t size) {}
71 void addValue(const Strigi::AnalysisResult* ar,
72 const Strigi::RegisteredField* fieldname, uint32_t value) {}
73 void addValue(const Strigi::AnalysisResult* ar,
74 const Strigi::RegisteredField* fieldname, int32_t value) {}
75 void addValue(const Strigi::AnalysisResult* ar,
76 const Strigi::RegisteredField* fieldname, double value) {}
77 void addTriplet(const std::string& subject,
78 const std::string& predicate, const std::string& object) {}
79 void addValue(const Strigi::AnalysisResult*,
80 const Strigi::RegisteredField* field, const std::string& name,
81 const std::string& value) {}
82public:
83 DummyIndexWriter(int v = 0) {
84 verbosity = v;
85 }
86 ~DummyIndexWriter() {}
87 void commit() {}
88 void deleteEntries(const std::vector<std::string>& entries) {}
89 void deleteAllEntries() {}
90};
91
92class DummyIndexManager : public Strigi::IndexManager {
93private:
94 DummyIndexWriter writer;
95public:
96 DummyIndexManager(int level) :writer(level) {}
97 Strigi::IndexWriter* indexWriter() {
98 return &writer;
99 }
100 Strigi::IndexReader* indexReader() {
101 return 0;
102 }
103};
104
105#endif
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include "grepindexmanager.h"
21#include "grepindexreader.h"
22#include "grepindexwriter.h"
23#include <strigi/indexwriter.h>
24using namespace Strigi;
25
26GrepIndexManager::GrepIndexManager(const char* regex)
27 :reader(0),
28 writer(new GrepIndexWriter(regex)){
29}
30GrepIndexManager::~GrepIndexManager() {
31 delete writer;
32}
33Strigi::IndexReader*
34GrepIndexManager::indexReader() {
35 return reader;
36}
37Strigi::IndexWriter*
38GrepIndexManager::indexWriter() {
39 return writer;
40}
41Strigi::IndexManager*
42createGrepIndexManager(const char* path) {
43 return new GrepIndexManager(path);
44}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifndef GREPINDEXMANAGER_H
21#define GREPINDEXMANAGER_H
22
23#include <strigi/strigiconfig.h>
24#include <strigi/indexmanager.h>
25
26class GrepIndexManager : public Strigi::IndexManager {
27private:
28 Strigi::IndexReader* const reader;
29 Strigi::IndexWriter* const writer;
30public:
31 explicit GrepIndexManager(const char* regex);
32 ~GrepIndexManager();
33
34 Strigi::IndexReader* indexReader();
35 Strigi::IndexWriter* indexWriter();
36};
37
38Strigi::IndexManager*
39createGrepIndexManager(const char* path);
40
41#endif
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include "grepindexreader.h"
21#include <strigi/analyzerconfiguration.h>
22#include <strigi/query.h>
23#include <strigi/variant.h>
24#include <strigi/indexwriter.h>
25#include <strigi/filelister.h>
26#include <set>
27using namespace std;
28using namespace Strigi;
29
30/**
31 * Custom configuration that extracts specific fields.
32 **/
33class FieldAnalyzerConfiguration : public AnalyzerConfiguration {
34private:
35 const set<string> neededFields;
36 mutable set<string> availableFields;
37 signed char m_maxDepth;
38 const bool needsAllFields;
39public:
40 FieldAnalyzerConfiguration(const set<string>& fields);
41 bool useFactory(StreamAnalyzerFactory*) const;
42 bool useFactory(StreamEndAnalyzerFactory*) const;
43 bool hasAllFields() const;
44};
45FieldAnalyzerConfiguration::FieldAnalyzerConfiguration(const set<string>& f)
46 :neededFields(f), m_maxDepth(-1), needsAllFields(f.find("")!=f.end()) {
47}
48bool
49FieldAnalyzerConfiguration::useFactory(StreamAnalyzerFactory* f) const {
50 bool use = needsAllFields;
51 const vector<const RegisteredField*>& fields = f->registeredFields();
52 vector<const RegisteredField*>::const_iterator i;
53 for (i = fields.begin(); i != fields.end(); ++i) {
54 const RegisteredField* field = *i;
55 do {
56 if (neededFields.find(field->key()) != neededFields.end()) {
57 availableFields.insert(field->key());
58 use = true;
59 }
60 field = field->parent();
61 } while (field);
62 }
63 return use;
64}
65bool
66FieldAnalyzerConfiguration::useFactory(StreamEndAnalyzerFactory* f) const {
67 return f->analyzesSubStreams() || static_cast<StreamAnalyzerFactory*>(f);
68}
69
70class QueryIndexWriter : public IndexWriter {
71public:
72 void startAnalysis(const AnalysisResult*) {}
73 void addText(const AnalysisResult* result, const char* text, int32_t length) {}
74 void addValue(const AnalysisResult* result, const RegisteredField* field,
75 const std::string& value) {}
76 void addValue(const AnalysisResult* result, const RegisteredField* field,
77 const unsigned char* data, uint32_t size) {}
78 void addValue(const AnalysisResult* result, const RegisteredField* field,
79 int32_t value) {}
80 void addValue(const AnalysisResult* result, const RegisteredField* field,
81 uint32_t value) {}
82 void addValue(const AnalysisResult* result, const RegisteredField* field,
83 double value) {}
84 void addValue(const AnalysisResult* result, const RegisteredField* field,
85 const std::string& name, const std::string& value) {}
86 void finishAnalysis(const AnalysisResult* result) {}
87 void addTriplet(const std::string& subject,
88 const std::string& predicate, const std::string& object) {}
89public:
90 void commit() { return; }
91 void deleteEntries(const std::vector<std::string>& entries) {}
92 void deleteAllEntries() {}
93 /**
94 * @brief Return the number of objects that are currently in the cache.
95 **/
96 virtual int itemsInCache() { return 0; }
97 void optimize() {}
98 void initWriterData(const Strigi::FieldRegister& fieldRegister) {}
99 void releaseWriterData(const Strigi::FieldRegister& fieldRegister) {}
100};
101
102class GrepIndexReader::Private {
103public:
104 const string dir;
105
106 Private(const string& d) :dir(d) {}
107};
108
109GrepIndexReader::GrepIndexReader(const string& dir) :p(new Private(dir)) {
110}
111GrepIndexReader::~GrepIndexReader() {
112 delete p;
113}
114void
115getFields(set<string>& fields, const Query& query) {
116 copy(query.fields().begin(), query.fields().end(),
117 inserter(fields, fields.begin()));
118 for (vector<Query>::const_iterator i = query.subQueries().begin();
119 i != query.subQueries().end(); ++i) {
120 getFields(fields, *i);
121 }
122}
123int32_t
124GrepIndexReader::countHits(const Query& query) {
125 QueryIndexWriter qiw;
126 // make an analyzerconfiguration with a limited set of fields
127 set<string> fields;
128 getFields(fields, query);
129 FieldAnalyzerConfiguration conf(fields);
130 StreamAnalyzer analyzer(conf);
131 analyzer.setIndexWriter(qiw);
132 return -1;
133}
134vector<IndexedDocument>
135GrepIndexReader::query(const Query&, int offset, int max) {
136 vector<IndexedDocument> hits;
137 return hits;
138}
139void
140GrepIndexReader::getHits(const Strigi::Query&,
141 const std::vector<std::string>& fields,
142 const std::vector<Strigi::Variant::Type>& types,
143 std::vector<std::vector<Strigi::Variant> >& result, int off, int max) {
144 result.clear();
145}
146map<string, time_t>
147GrepIndexReader::files(char depth) {
148 map<string, time_t> files;
149 return files;
150}
151int32_t
152GrepIndexReader::countDocuments() { return -1; }
153int32_t
154GrepIndexReader::countWords() { return -1; }
155int64_t
156GrepIndexReader::indexSize() {
157 // we have no index :-)
158 return 0;
159}
160/**
161 * This does not have to be implemented since we have not index.
162 **/
163time_t
164GrepIndexReader::mTime(const std::string& uri) {
165 return -1;
166}
167vector<string>
168GrepIndexReader::fieldNames() {
169 vector<string> fieldnames;
170 return fieldnames;
171}
172vector<pair<string,uint32_t> >
173GrepIndexReader::histogram(const string& query, const string& fieldname,
174 const string& labeltype) {
175 vector<pair<string,uint32_t> > histogram;
176 return histogram;
177}
178int32_t
179GrepIndexReader::countKeywords(const string& keywordprefix,
180 const vector<string>& fieldnames) {
181 return -1;
182}
183vector<string>
184GrepIndexReader::keywords(
185 const string& keywordmatch,
186 const vector<string>& fieldnames,
187 uint32_t max, uint32_t offset) {
188 vector<string> keywords;
189 return keywords;
190}
191int
192main() {
193 GrepIndexReader("/home");
194 return 0;
195}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifndef GREPINDEXREADER_H
21#define GREPINDEXREADER_H
22
23#include <strigi/strigiconfig.h>
24#include <strigi/indexreader.h>
25
26#include <map>
27#include <time.h>
28
29class GrepIndexReader : public Strigi::IndexReader {
30private:
31 class Private;
32 Private* const p;
33public:
34 GrepIndexReader(const std::string& dir);
35 ~GrepIndexReader();
36 int32_t countHits(const Strigi::Query& query);
37 std::vector<Strigi::IndexedDocument> query(const Strigi::Query&, int offset,
38 int max);
39 void getHits(const Strigi::Query&, const std::vector<std::string>& fields,
40 const std::vector<Strigi::Variant::Type>& types,
41 std::vector<std::vector<Strigi::Variant> >& result, int off, int max);
42 std::map<std::string, time_t> files(char depth);
43 int32_t countDocuments();
44 int32_t countWords();
45 int64_t indexSize();
46 time_t mTime(const std::string& uri);
47 std::vector<std::string> fieldNames();
48 std::vector<std::pair<std::string,uint32_t> > histogram(
49 const std::string& query, const std::string& fieldname,
50 const std::string& labeltype);
51 int32_t countKeywords(const std::string& keywordprefix,
52 const std::vector<std::string>& fieldnames);
53 std::vector<std::string> keywords(
54 const std::string& keywordmatch,
55 const std::vector<std::string>& fieldnames,
56 uint32_t max, uint32_t offset);
57};
58
59#endif
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include "grepindexwriter.h"
21#include <strigi/analysisresult.h>
22#include <strigi/fieldtypes.h>
23#include <regex.h>
24using namespace std;
25
26class GrepIndexWriter::Private
27{
28public:
29 Private() {}
30 regex_t regex;
31};
32
33GrepIndexWriter::GrepIndexWriter(const char* re)
34 : d(new Private()) {
35 regcomp(&d->regex, re, REG_NOSUB);
36}
37GrepIndexWriter::~GrepIndexWriter() {
38 regfree(&d->regex);
39 delete d;
40}
41void
42GrepIndexWriter::startAnalysis(const Strigi::AnalysisResult* idx) {
43}
44void
45GrepIndexWriter::finishAnalysis(const Strigi::AnalysisResult* idx) {
46}
47void
48GrepIndexWriter::addText(const Strigi::AnalysisResult* idx, const char* text,
49 int32_t length) {
50 // unfortunately we have to copy the incoming stream because regexec()
51 // assumes a null-terminated string and we are not allowed to modify the
52 // incoming message
53 string s;
54 const char* start = text;
55 const char* end = text+length;
56 const char* p = start;
57 while (p < end) {
58 // look at each line separately
59 if (*p == '\n' || *p == '\r') {
60 s.assign(start, p-start);
61 if (regexec(&d->regex, s.c_str(), 0, 0, 0) == 0) {
62 printf("%s:%s\n", idx->path().c_str(), s.c_str());
63 }
64 start = p+1;
65 }
66 p++;
67 }
68 s.assign(start, p-start);
69 if (regexec(&d->regex, s.c_str(), 0, 0, 0) == 0) {
70 printf("%s:%s\n", idx->path().c_str(), s.c_str());
71 }
72}
73void
74GrepIndexWriter::addValue(const Strigi::AnalysisResult* idx,
75 const Strigi::RegisteredField* field, const std::string& value) {
76 if (regexec(&d->regex, value.c_str(), 0, 0, 0) == 0) {
77 printf("%s:%s:%s\n", idx->path().c_str(),
78 field->key().c_str(), value.c_str());
79 }
80}
81void
82GrepIndexWriter::addValue(const Strigi::AnalysisResult* idx,
83 const Strigi::RegisteredField* field,
84 const unsigned char* data, uint32_t size) {
85 if (!field->properties().binary()) {
86 string value((const char*)data, size);
87 addValue(idx, field, value);
88 }
89}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifndef GREPINDEXWRITER_H
21#define GREPINDEXWRITER_H
22
23#include <strigi/indexwriter.h>
24
25class GrepIndexWriter : public Strigi::IndexWriter {
26private:
27 class Private;
28 Private * const d;
29protected:
30 void startAnalysis(const Strigi::AnalysisResult* idx);
31 void finishAnalysis(const Strigi::AnalysisResult* idx);
32 void addText(const Strigi::AnalysisResult* idx, const char* text,
33 int32_t length);
34 void addValue(const Strigi::AnalysisResult* idx,
35 const Strigi::RegisteredField* field, const std::string& value);
36 void addValue(const Strigi::AnalysisResult* idx,
37 const Strigi::RegisteredField* field,
38 const unsigned char* data, uint32_t size);
39 void addValue(const Strigi::AnalysisResult* idx,
40 const Strigi::RegisteredField* field, uint32_t value) {}
41 void addValue(const Strigi::AnalysisResult* idx,
42 const Strigi::RegisteredField* field, int32_t value) {}
43 void addValue(const Strigi::AnalysisResult* idx,
44 const Strigi::RegisteredField* field, double value) {}
45 void addTriplet(const std::string& subject,
46 const std::string& predicate, const std::string& object) {}
47 void addValue(const Strigi::AnalysisResult*,
48 const Strigi::RegisteredField* field, const std::string& name,
49 const std::string& value) {}
50public:
51 explicit GrepIndexWriter(const char* re);
52 ~GrepIndexWriter();
53 void commit() {}
54 void deleteEntries(const std::vector<std::string>& entries) {}
55 void deleteAllEntries() {}
56};
57
58#endif
  
# Plot the data file 'graph' into the PNG image x.png,
# using a logarithmic x axis and no legend.
set terminal png
set output 'x.png'

# axes
set logscale x
set xlabel 'exit time (s)'
set ylabel 'chance that exit time is larger'

# line style 1: line type 1, width 6
set style line 1 linetype 1 linewidth 6
set key off

plot 'graph'
  
# Build rules for the XML and RDF indexer command line tools.
# All executables link against the streamanalyzer library.

# Header search paths shared by every target in this directory.
include_directories(
    ../streamanalyzer
    ../streams
    ../streams/strigi
    ${strigi_BINARY_DIR}/src/streams
    ${BZIP2_INCLUDE_DIR}
)

# rdfindexer (installed)
add_executable(rdfindexer
    rdfindexer.cpp
    tagmapping.cpp
    rdfindexwriter.cpp
)
target_link_libraries(rdfindexer streamanalyzer)
install(TARGETS rdfindexer RUNTIME DESTINATION bin)

# xmlindexer (installed)
add_executable(xmlindexer
    xmlindexer.cpp
    tagmapping.cpp
    xmlindexwriter.cpp
)
target_link_libraries(xmlindexer streamanalyzer)
install(TARGETS xmlindexer RUNTIME DESTINATION bin)

# cgixmlindexer (built but not installed)
add_executable(cgixmlindexer
    tagmapping.cpp
    xmlindexwriter.cpp
    cgixmlindexer.cpp
)
target_link_libraries(cgixmlindexer streamanalyzer)

# peranalyzerxml is currently disabled:
#add_executable(peranalyzerxml peranalyzerxml.cpp xmlindexwriter.cpp
#    tagmapping.cpp)
#target_link_libraries(peranalyzerxml streamanalyzer)

# perfieldxml (built but not installed)
add_executable(perfieldxml
    perfieldxml.cpp
    xmlindexwriter.cpp
    tagmapping.cpp
)
target_link_libraries(perfieldxml streamanalyzer)

# is this still broken on win32?
if(NOT WIN32)
    add_executable(ontoprint ontoprint.cpp)
    target_link_libraries(ontoprint streamanalyzer)
endif()

# Register all tests based on the data in the testdata directory.
# The glob that fills 'allfiles' is commented out, so the loop below does not
# register anything at the moment; the tests it would register run the
# (also disabled) peranalyzerxml executable.
#file(GLOB_RECURSE allfiles ../../testdata/analyzers/*/config)
foreach(file ${allfiles})
    # testdir: directory holding the config file, dir: its last component
    get_filename_component(testdir "${file}" PATH)
    get_filename_component(dir "${testdir}" NAME)
    file(GLOB_RECURSE ofiles "${testdir}/*")
    foreach(ofile ${ofiles})
        # map the reference output file to the input file under /data/
        string(REPLACE "/analyzers/${dir}/" "/data/" ifile "${ofile}")
        # skip subversion metadata and the config file itself; the paths are
        # quoted so that spaces or semicolons in them cannot split arguments
        if(NOT "${ifile}" MATCHES "/.svn/" AND NOT "${ifile}" MATCHES "config$")
            # test name: analyzer directory name + path below that directory
            string(REPLACE "${testdir}" "" testname "${ofile}")
            add_test("${dir}${testname}" peranalyzerxml -c "${testdir}/config"
                -r "${ofile}" "${ifile}")
        endif()
    endforeach()
endforeach()
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20
21#include "xmlindexwriter.h"
22#include <strigi/stringstream.h>
23#include <strigi/stringterminatedsubstream.h>
24#include <strigi/subinputstream.h>
25#include <time.h>
26#include <iostream>
27#include <cstdlib>
28#include <stdlib.h> // getenv
29using namespace Strigi;
30using namespace std;
31
32string
33readHeader(InputStream* f) {
34 StringTerminatedSubStream header(f, "\r\n\r\n");
35 string h;
36 const char* d;
37 int32_t nread = header.read(d, 1000, 0);
38 while (nread > 0) {
39 h.append(d, nread);
40 nread = header.read(d, 1000, 0);
41 }
42 return h;
43}
44
45/**
46 * Start parsing a file. The stream must be positioned at the start of the file
47 * header.
48 **/
49bool
50parseFile(StreamAnalyzer& sa, XmlIndexManager& manager,
51 InputStream* f, const string& delim) {
52
53 string header = readHeader(f);
54 string filename;
55 const char* start = header.c_str();
56 start = strstr(start, "filename=");
57 if (start) {
58 start += 9;
59 const char* end = 0;
60 char c = *start;
61 if (c == '\'' || c == '"') {
62 start += 1;
63 end = strchr(start, c);
64 }
65 if (end) {
66 filename.assign(start, end-start);
67 } else {
68 filename.assign(start);
69 }
70 }
71
72 // analyzer the stream
73 StringTerminatedSubStream stream(f, delim);
74 if (filename.size()) {
75 AnalysisResult result(filename, time(0), *manager.indexWriter(), sa);
76 sa.analyze(result, &stream);
77 }
78 // read the rest of the stream
79 const char* d;
80 int32_t nread = stream.read(d, 1000, 0);
81 while (nread > 0) {
82 nread = stream.read(d, 1000, 0);
83 }
84
85 // check if this is the last file
86 nread = f->read(d, 2, 2);
87 return nread == 2 && *d == '\r' && d[1] == '\n';
88}
89
90int
91main() {
92 const TagMapping mapping(0);
93 cout << "Content-Type:text/xml;charset=UTF-8\r\n\r\n"
94 "<?xml version='1.0' encoding='UTF-8'?>\n<"
95 << mapping.map("metadata") << ">\n";
96
97 int len;
98 const char* lenstr = getenv("CONTENT_LENGTH");
99 if (lenstr == NULL || sscanf(lenstr,"%id", &len) != 1 || len < 0) {
100 cout << " <error>input too small</error>\n</"
101 << mapping.map("metadata") << ">\n" << flush;
102 return 0;
103 }
104 cerr << "len " << len << endl;
105 char* e = new char[len];
106 if (e == 0 || fread(e, 1, len, stdin) != (size_t)len) {
107 cout << " <error>cannot allocate memory</error>\n</"
108 << mapping.map("metadata") << ">\n" << flush;
109 return 0;
110 }
111
112 // read from stdin
113 StringInputStream stream(e, len);
114
115 // read the first line
116 const char* d = NULL;
117 const int32_t maxlength = 1024;
118 int32_t nread = stream.read(d, maxlength, maxlength);
119 stream.reset(0);
120
121 if (nread < 1) {
122 cout << " <error>input too small</error>\n</"
123 << mapping.map("metadata") << ">\n" << flush;
124 return 0;
125 }
126
127 // get out the delimiter
128 const char* end = d + nread;
129 const char* p = d;
130 while (p < end-1 && *p != '\r') p++;
131 if (*p != '\r' || p[1] != '\n') {
132 cout << " <error>no delimiter line</error></"
133 << mapping.map("metadata") << ">\n" << flush;
134 return 0;
135 }
136 string delim("\r\n");
137 delim.append(d, p-d);
138
139 // skip the delimiter + '\r\n'
140 stream.reset(delim.length() + 2);
141
142 // parse all files
143 XmlIndexManager manager(cout, mapping);
144 AnalyzerConfiguration ac;
145 StreamAnalyzer sa(ac);
146 sa.setIndexWriter(*manager.indexWriter());
147 while (parseFile(sa, manager, &stream, delim)) {};
148 cout << "</" << mapping.map("metadata") << ">\n" << flush;
149
150 return 0;
151}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include <iostream>
21#include <cstdlib>
22#include <cstring>
23#include <string>
24#include <list>
25#include <algorithm>
26#include <unistd.h>
27#include <getopt.h>
28#include <strigi/fieldpropertiesdb.h>
29#include <strigi/streamanalyzer.h>
30#include <strigi/analyzerconfiguration.h>
31using namespace std;
32using namespace Strigi;
33
34void
35printDot(ostream& out, const char* locale) {
36 const map<string, FieldProperties>& p
37 = FieldPropertiesDb::db().allProperties();
38 map<string, FieldProperties>::const_iterator i;
39 list<string> categories;
40 out << "digraph{graph[rankdir=LR];" << endl;
41 for (i = p.begin(); i != p.end(); ++i) {
42 const vector<string>& parents = i->second.parentUris();
43 vector<string>::const_iterator j;
44 out << '"' << i->second.uri() << "\" [shape=record, label =\"" << i->second.uri() << "|{type: " << i->second.typeUri() << "}\"];" << endl;
45 for (j = parents.begin(); j != parents.end(); ++j) {
46 out << '"' << *j << "\"->\"" << i->second.uri() << "\";" << endl;
47 }
48 // make link to category, e.g. chemistry for chemistry.inchi
49 string category = i->second.uri().substr(0,i->second.uri().find("."));
50 if (category.length() != i->second.uri().length()) {
51 list<string>::const_iterator match = find(categories.begin(), categories.end(), category);
52 if (match == categories.end()) {
53 categories.push_back(category);
54 out << "\"" << category << "\" [style=filled,color=gray];" << endl;
55 }
56 out << "\"" << category << "\"->\"" << i->second.uri() << "\";" << endl;
57 }
58 }
59 out << "}" << endl;
60}
61void
62printRdfsProperties(ostream& out, const FieldProperties& p) {
63 out << " <rdf:Property rdf:about='" << p.uri() << "'>\n"
64 << " <rdfs:label>" << p.name() << "</rdfs:label>\n"
65 << " <rdfs:comment>" << p.description() << "</rdfs:comment>\n";
66 const vector<string>& parents = p.parentUris();
67 vector<string>::const_iterator j;
68 for (j = parents.begin(); j != parents.end(); ++j){
69 out << " <rdfs:subPropertyOf rdf:resource='" << *j << "'/>\n";
70 }
71 const vector<string>& classes = p.applicableClasses();
72 for (j = classes.begin(); j != classes.end(); ++j){
73 out << " <rdfs:domain rdf:resource='" << *j << "'/>\n";
74 }
75
76 out << " <rdfs:range rdf:resource='" << p.typeUri() << "'/>\n";
77 const vector<string>& locales = p.locales();
78 for (j = locales.begin(); j != locales.end(); ++j) {
79 const string& name = p.localizedName(*j);
80 if (name.size()) {
81 out << " <rdfs:label xml:lang='" << *j << "'>" << name << "</rdfs:label>\n";
82 }
83 const string& description = p.localizedDescription(*j);
84 if (description.size()) {
85 out << " <rdfs:comment xml:lang='" << *j << "'>" << description << "</rdfs:comment>\n";
86 }
87 }
88 out << " </rdf:Property>\n";
89}
90
91void
92printRdfsClasses(ostream& out, const ClassProperties& p) {
93 out << " <rdfs:Class rdf:about='" << p.uri() << "'>\n"
94 << " <rdfs:label>" << p.name() << "</rdfs:label>\n"
95 << " <rdfs:comment>" << p.description() << "</rdfs:comment>\n";
96 const vector<string>& parents = p.parentUris();
97 vector<string>::const_iterator j;
98 for (j = parents.begin(); j != parents.end(); ++j){
99 out << " <rdfs:subClassOf rdf:resource='" << *j << "'/>\n";
100 }
101
102 const vector<string>& locales = p.locales();
103 for (j = locales.begin(); j != locales.end(); ++j) {
104 const string& name = p.localizedName(*j);
105 if (name.size()) {
106 out << " <rdfs:label xml:lang='" << *j << "'>" << name << "</rdfs:label>\n";
107 }
108 const string& description = p.localizedDescription(*j);
109 if (description.size()) {
110 out << " <rdfs:comment xml:lang='" << *j << "'>" << description << "</rdfs:comment>\n";
111 }
112 }
113 out << " </rdfs:Class>\n";
114}
115
116void
117printRdfs(ostream& out) {
118 out << "<?xml version='1.0' encoding='UTF-8'?>\n"
119 "<!DOCTYPE rdf:RDF [\n"
120 " <!ENTITY rdf 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'>\n"
121 " <!ENTITY strigi 'http://strigi URL goes here#'>\n"
122 " <!ENTITY rdfs 'http://www.w3.org/2000/01/rdf-schema#'>\n"
123 "]>\n"
124 "<rdf:RDF "
125 "xmlns:rdf='&rdf;' xmlns:strigi='&strigi;' xmlns:rdfs='&rdfs;'>\n";
126
127 const map<string, FieldProperties>& p
128 = FieldPropertiesDb::db().allProperties();
129 map<string, FieldProperties>::const_iterator i;
130 for (i = p.begin(); i != p.end(); ++i) {
131 printRdfsProperties(out, i->second);
132 }
133
134 const map<string, ClassProperties>& c
135 = FieldPropertiesDb::db().allClasses();
136 map<string, ClassProperties>::const_iterator j;
137 for (j = c.begin(); j != c.end(); ++j) {
138 printRdfsClasses(out, j->second);
139 }
140
141 out << "</rdf:RDF>" << endl;
142}
/** Write the one-line usage message for the given program name to cerr. */
void
printHelp(const char* program) {
    cerr << "Usage: ";
    cerr << program;
    cerr << " [--type=<type>] [--locale=<locale>]" << endl;
}
148int
149main(int argc, char** argv) {
150 struct option long_options[] = {
151 {"help", no_argument, 0, 0},
152 {"type", required_argument, 0, 0},
153 {"locale", required_argument, 0, 0}
154 };
155 const char* type = 0;
156 const char* locale = 0;
157 bool help = false;
158 while (1) {
159 int optindex;
160 int c = getopt_long(argc, argv, "", long_options, &optindex);
161 if (c == -1) break;
162 if (c == 0) {
163 if (optindex == 0) help = true;
164 if (optindex == 1) type = optarg;
165 if (optindex == 2) locale = optarg;
166 }
167 switch (c) {
168 case '?':
169 printHelp(argv[0]);
170 exit(1);
171 default:
172 break;
173 }
174 }
175
176 // load the plugins
177 AnalyzerConfiguration ac;
178 StreamAnalyzer s(ac);
179
180 if (help) {
181 printHelp(argv[0]);
182 } else if (type && strcmp(type, "dot") == 0) {
183 printDot(cout, locale);
184 } else {
185 printRdfs(cout);
186 }
187 return 0;
188}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifdef HAVE_CONFIG_H
21 #include <config.h>
22#endif
23
24#include <strigi/strigiconfig.h>
25//#include "compat.h"
26#include <strigi/fileinputstream.h>
27#include <strigi/bz2inputstream.h>
28#include <strigi/diranalyzer.h>
29#include <strigi/analyzerconfiguration.h>
30#include <strigi/streamendanalyzer.h>
31#include <strigi/streamthroughanalyzer.h>
32#include <strigi/streamlineanalyzer.h>
33#include <strigi/streamsaxanalyzer.h>
34#include <strigi/streameventanalyzer.h>
35#include "xmlindexwriter.h"
36
37#include <cstdio>
38#include <cstring>
39#include <cerrno>
40#include <algorithm>
41#ifdef HAVE_UNISTD_H
42 #include <unistd.h>
43#endif
44#include <stdlib.h>
45#ifdef HAVE_DIRECT_H
46 #include <direct.h>
47#endif
48#include <iostream>
49#include <sstream>
50#include <fstream>
51#include <set>
52using namespace Strigi;
53using namespace std;
54
55class SelectedAnalyzerConfiguration : public Strigi::AnalyzerConfiguration {
56public:
57 const set<string> requiredAnalyzers;
58 set<string> obligatoryAnalyzers;
59 mutable set<string> usedAnalyzers;
60 mutable set<string> availableAnalyzers;
61
62 explicit SelectedAnalyzerConfiguration(const set<string> an)
63 : requiredAnalyzers(an) {
64 obligatoryAnalyzers.insert("EventThroughAnalyzer");
65 }
66
67 bool valid() const {
68 return requiredAnalyzers.size() + 1 == usedAnalyzers.size()
69 || requiredAnalyzers.size() == 0;
70 }
71 bool useFactory(const string& name) const {
72 bool use = requiredAnalyzers.find(name) != requiredAnalyzers.end()
73 || obligatoryAnalyzers.find(name) != obligatoryAnalyzers.end()
74 || requiredAnalyzers.size() == 0;
75 if (use) {
76 usedAnalyzers.insert(name);
77 }
78 availableAnalyzers.insert(name);
79 return use;
80 }
81 bool useFactory(StreamEndAnalyzerFactory* f) const {
82 return useFactory(f->name());
83 }
84 bool useFactory(StreamThroughAnalyzerFactory* f) const {
85 return useFactory(f->name());
86 }
87 bool useFactory(StreamSaxAnalyzerFactory* f) const {
88 return useFactory(f->name());
89 }
90 bool useFactory(StreamEventAnalyzerFactory* f) const {
91 return useFactory(f->name());
92 }
93 bool useFactory(StreamLineAnalyzerFactory* f) const {
94 return useFactory(f->name());
95 }
96};
97
/** Write the usage message to stderr; argv[0] is used as the program name. */
void
printUsage(char** argv) {
    static const char usage[] =
        "Usage: %s [OPTIONS] SOURCE\n"
        "Analyze the given file and output the result as XML.\n"
        " -c configuration file\n"
        " -a comma-separated list of analyzers\n"
        " -r reference output, when specified, the reference output is \n"
        " compared to the given output and the first difference is \n"
        " reported.\n";
    const char* const program = argv[0];
    fprintf(stderr, usage, program);
}
/**
 * Return true if any argument after argv[0] is exactly "--help" or "-h".
 **/
bool
containsHelp(int argc, char **argv) {
    int index = 1;
    while (index < argc) {
        const char* const arg = argv[index];
        if (!strcmp(arg, "--help") || !strcmp(arg, "-h")) {
            return true;
        }
        ++index;
    }
    return false;
}
/**
 * Split a comma-separated list into the set of its parts. Empty parts
 * (including the one from an empty input) are inserted as empty strings.
 **/
set<string>
parseAnalyzerNames(const char* names) {
    const string all(names);
    set<string> result;
    string::size_type begin = 0;
    for (;;) {
        const string::size_type comma = all.find(',', begin);
        if (comma == string::npos) {
            // the last part runs to the end of the string
            result.insert(all.substr(begin));
            break;
        }
        result.insert(all.substr(begin, comma - begin));
        begin = comma + 1;
    }
    return result;
}
/**
 * Read the file named config line by line and return the set of values of
 * all lines that start with "analyzer=" (the text after the '=').
 * A file that cannot be opened yields an empty set.
 **/
set<string>
parseConfig(const char* config) {
    static const string prefix("analyzer=");
    set<string> names;
    ifstream input(config);
    string line;
    while (getline(input, line)) {
        if (line.compare(0, prefix.size(), prefix) == 0) {
            names.insert(line.substr(prefix.size()));
        }
    }

    return names;
}
144/**
145 * Usage: $0 [OPTIONS] SOURCE
146 **/
147int
148main(int argc, char** argv) {
149 setenv("XDG_DATA_HOME", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1);
150 setenv("XDG_DATA_DIRS", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1);
151 setenv("STRIGI_PLUGIN_PATH", BINARYDIR"/src/streamanalyzer/throughplugins"
152 PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/lineplugins"
153 PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/saxplugins", 1);
154 // there are 2 optional options that both require an argument.
155 // one can specify 1 source, so the number of arguments must be
156 // 2, 4 or 6
157 if (containsHelp(argc, argv) || (argc != 2 && argc != 4 && argc != 6)) {
158 printUsage(argv);
159 return -1;
160 }
161
162 set<string> analyzers;
163 const char* targetFile;
164 const char* referenceFile = 0;
165 if (argc == 4) {
166 if (strcmp(argv[1],"-a") == 0) {
167 analyzers = parseAnalyzerNames(argv[2]);
168 } else if (strcmp(argv[1], "-r") == 0) {
169 referenceFile = argv[2];
170 } else if (strcmp(argv[1], "-c") == 0) {
171 analyzers = parseConfig(argv[2]);
172 } else {
173 printUsage(argv);
174 return -1;
175 }
176 targetFile = argv[3];
177 } else if (argc == 6) {
178 if (strcmp(argv[1], "-a") == 0) {
179 analyzers = parseAnalyzerNames(argv[2]);
180 if (strcmp(argv[3], "-r") == 0) {
181 referenceFile = argv[4];
182 }
183 } else if (strcmp(argv[1], "-c") == 0) {
184 analyzers = parseConfig(argv[2]);
185 if (strcmp(argv[3], "-r") == 0) {
186 referenceFile = argv[4];
187 }
188 } else if (strcmp(argv[1], "-r") == 0) {
189 referenceFile = argv[2];
190 if (strcmp(argv[3], "-a") == 0) {
191 analyzers = parseAnalyzerNames(argv[4]);
192 } else if (strcmp(argv[3], "-c") == 0) {
193 analyzers = parseConfig(argv[4]);
194 }
195 } else {
196 printUsage(argv);
197 return -1;
198 }
199 targetFile = argv[5];
200 } else {
201 targetFile = argv[1];
202 }
203
204 const char* mappingFile = 0;
205
206 // check that the target file exists
207 {
208 ifstream filetest(targetFile);
209 if (!filetest.good()) {
210 cerr << "The file '" << targetFile << "' cannot be read." << endl;
211 return 1;
212 }
213 }
214 // check that the result file is ok
215 FileInputStream f(referenceFile);
216 if (referenceFile != 0 && f.status() != Ok) {
217 cerr << "The file '" << referenceFile << "' cannot be read." << endl;
218 return 1;
219 }
220
221 const TagMapping mapping(mappingFile);
222 ostringstream out;
223 out << "<?xml version='1.0' encoding='UTF-8'?>\n<"
224 << mapping.map("metadata");
225 map<string, string>::const_iterator i = mapping.namespaces().begin();
226 while (i != mapping.namespaces().end()) {
227 out << " xmlns:" << i->first << "='" << i->second << "'";
228 i++;
229 }
230 out << ">\n";
231
232 SelectedAnalyzerConfiguration ic(analyzers);
233
234 XmlIndexManager manager(out, mapping);
235 DirAnalyzer analyzer(manager, ic);
236 if (!ic.valid()) {
237 set<string>::const_iterator i;
238 set<string> missing;
239 set_difference(analyzers.begin(), analyzers.end(),
240 ic.availableAnalyzers.begin(), ic.availableAnalyzers.end(),
241 insert_iterator<set<string> >(missing, missing.begin()));
242 if (missing.size() == 1) {
243 fprintf(stderr, "No analyzer with name %s was found.\n",
244 missing.begin()->c_str());
245 } else {
246 cerr << "The analyzers";
247 for (i = missing.begin(); i != missing.end(); ++i) {
248 cerr << ", " << *i;
249 }
250 cerr << " were not found." << endl;
251 }
252 fprintf(stderr, "Choose from:\n");
253 for (i = ic.availableAnalyzers.begin();
254 i != ic.availableAnalyzers.end(); ++i) {
255 cerr << " " << *i << endl;
256 }
257 return 1;
258 }
259
260 // change to the directory of the file to analyze
261 // this ensures a consistent naming of the file uris, regardless of cwd
262 string targetPath(targetFile);
263 string::size_type slashpos = targetPath.rfind('/');
264 if (slashpos == string::npos) {
265 analyzer.analyzeDir(targetFile);
266 } else {
267 if (chdir(targetPath.substr(0,slashpos).c_str()) == -1) {
268 fprintf(stderr, "%s\n", strerror(errno));
269 return -1;
270 }
271 analyzer.analyzeDir(targetPath.substr(slashpos+1).c_str());
272 }
273 string str = out.str();
274 int32_t n = 2*(int32_t)str.length();
275
276 // if no reference file was specified, we output the analysis
277 if (referenceFile == 0) {
278 cout << str;
279 return 0;
280 }
281
282 // load the file to compare with
283 const char* c;
284 n = f.read(c, n, n);
285 if (n < 0) {
286 fprintf(stderr, "Error: %s\n", f.error());
287 return -1;
288 }
289 if (n != (int32_t)out.str().length()) {
290 cout << "output length differs " << out.str().length() << " instead of "
291 << n << endl;
292 return -1;
293 }
294
295 const char* p1 = c;
296 const char* p2 = str.c_str();
297 int32_t n1 = n;
298 string::size_type n2 = str.length();
299 while (n1-- && n2-- && *p1 == *p2) {
300 p1++;
301 p2++;
302 }
303 if (n1 ==0 && (*p1 || *p2)) {
304 cout << "difference at position " << p1-c << endl;
305
306 int32_t m = (80 > str.length())?(int32_t)str.length():80;
307 printf("%i %.*s\n", m, m, str.c_str());
308
309 m = (80 > n)?n:80;
310 printf("%i %.*s\n", m, m, c);
311
312 return -1;
313 }
314
315 return 0;
316}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006,2008 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifdef HAVE_CONFIG_H
21 #include <config.h>
22#endif
23
24#include <strigi/strigiconfig.h>
25#include <strigi/fileinputstream.h>
26#include <strigi/bz2inputstream.h>
27#include <strigi/diranalyzer.h>
28#include <strigi/analyzerconfiguration.h>
29#include "xmlindexwriter.h"
30#include <strigi/streamendanalyzer.h>
31#include <strigi/streamthroughanalyzer.h>
32#include <strigi/streamlineanalyzer.h>
33#include <strigi/streamsaxanalyzer.h>
34#include <strigi/streameventanalyzer.h>
35
36#include <cstdio>
37#include <cstring>
38#include <cerrno>
39#include <algorithm>
40#ifdef HAVE_UNISTD_H
41 #include <unistd.h>
42#endif
43#ifdef HAVE_DIRECT_H
44 #include <direct.h>
45#endif
46
47#include <sstream>
48#include <iostream>
49#include <fstream>
50#include <set>
51using namespace Strigi;
52using namespace std;
53
54/**
55 * Configure analysis bases on what fields we want to extract.
56 **/
57class SelectedFieldConfiguration : public Strigi::AnalyzerConfiguration {
58public:
59 /**
60 * Fields the user has requested to be reported.
61 **/
62 const set<string> requiredFields;
63 /**
64 * Fields that were requested by the user and are provided by some analyzer.
65 **/
66 mutable set<string> usedFields;
67 /**
68 * All fields provided by all analyzers.
69 **/
70 mutable set<string> availableFields;
71
72 explicit SelectedFieldConfiguration(const set<string> af)
73 : requiredFields(af) {}
74
75 /**
76 * The configuration is valid if all fields requested can be supplied by
77 * the active set of analyzer.
78 **/
79 bool valid() const {
80 return requiredFields.size() == usedFields.size();
81 }
82 /**
83 * If a certain field should be reported, return Stored, otherwise return
84 * None.
85 **/
86 FieldType indexType(const Strigi::RegisteredField* f) const {
87 return (requiredFields.find(f->key()) != requiredFields.end())
88 ? Stored :None;
89 }
90 /**
91 * If any of the fields provided by the given analyzer are requested, use
92 * that analyzer for the analysis.
93 **/
94 bool useAnalyzerFactory(const StreamAnalyzerFactory* f) const {
95 bool use = false;
96 vector<const RegisteredField*>::const_iterator i;
97 i = f->registeredFields().begin();
98 const vector<const RegisteredField*>::const_iterator end =
99 f->registeredFields().end();
100 for (; i != end; ++i) {
101 string key((*i)->key());
102 availableFields.insert(key);
103 bool usethis = requiredFields.find(key) != requiredFields.end();
104 if (usethis) {
105 use = true;
106 usedFields.insert((*i)->key());
107 }
108 }
109 return use;
110 }
111 bool useFactory(StreamEndAnalyzerFactory* f) const {
112 return useAnalyzerFactory(f);
113 }
114 bool useFactory(StreamThroughAnalyzerFactory* f) const {
115 return useAnalyzerFactory(f);
116 }
117 bool useFactory(StreamSaxAnalyzerFactory* f) const {
118 return useAnalyzerFactory(f);
119 }
120 bool useFactory(StreamEventAnalyzerFactory* f) const {
121 return useAnalyzerFactory(f);
122 }
123 bool useFactory(StreamLineAnalyzerFactory* f) const {
124 return useAnalyzerFactory(f);
125 }
126};
127
/** Write the usage line to stderr; argv[0] is used as the program name. */
void
printUsage(char** argv) {
    const char* const program = argv[0];
    fprintf(stderr,
        "Usage: %s analyzer file-to-analyze referenceoutputfile\n", program);
}
/**
 * Tell whether one of the arguments following the program name asks for
 * help, i.e. equals "--help" or "-h".
 **/
bool
containsHelp(int argc, char **argv) {
    bool found = false;
    for (int pos = 1; !found && pos < argc; ++pos) {
        const bool isLong = strcmp(argv[pos], "--help") == 0;
        const bool isShort = strcmp(argv[pos], "-h") == 0;
        found = isLong || isShort;
    }
    return found;
}
/**
 * Split a comma-separated list of field names into a set. Every part
 * between commas is inserted, empty parts included.
 **/
set<string>
parseFieldNames(const char* names) {
    const string list(names);
    set<string> fields;
    string::size_type from = 0;
    string::size_type comma;
    do {
        comma = list.find(',', from);
        if (comma == string::npos) {
            // no further comma: the rest of the string is the last part
            fields.insert(list.substr(from));
        } else {
            fields.insert(list.substr(from, comma - from));
            from = comma + 1;
        }
    } while (comma != string::npos);
    return fields;
}
154/**
155 * Usage: $0 [OPTIONS] SOURCE
156 **/
157int
158main(int argc, char** argv) {
159 // there are 2 optional options that both require an argument.
160 // one can specify 1 source, so the number of arguments must be
161 // 2, 4 or 6
162 if (containsHelp(argc, argv) || (argc != 2 && argc != 4 && argc != 6)) {
163 printUsage(argv);
164 return -1;
165 }
166
167 set<string> analyzers;
168 const char* targetFile;
169 const char* referenceFile = 0;
170 if (argc == 4) {
171 if (strcmp(argv[1],"-f") == 0) {
172 analyzers = parseFieldNames(argv[2]);
173 } else if (strcmp(argv[1], "-r") == 0) {
174 referenceFile = argv[2];
175 } else {
176 printUsage(argv);
177 return -1;
178 }
179 targetFile = argv[3];
180 } else if (argc == 6) {
181 if (strcmp(argv[1], "-f") == 0) {
182 analyzers = parseFieldNames(argv[2]);
183 if (strcmp(argv[3], "-r") == 0) {
184 referenceFile = argv[4];
185 }
186 } else if (strcmp(argv[1], "-r") == 0) {
187 referenceFile = argv[2];
188 if (strcmp(argv[3], "-f") == 0) {
189 analyzers = parseFieldNames(argv[4]);
190 }
191 } else {
192 printUsage(argv);
193 return -1;
194 }
195 targetFile = argv[5];
196 } else {
197 targetFile = argv[1];
198 }
199
200 const char* mappingFile = 0;
201
202 // check that the target file exists
203 {
204 ifstream filetest(targetFile);
205 if (!filetest.good()) {
206 cerr << "The file '" << targetFile << "' cannot be read." << endl;
207 return 1;
208 }
209 }
210
211 const TagMapping mapping(mappingFile);
212 ostringstream out;
213 out << "<?xml version='1.0' encoding='UTF-8'?>\n<"
214 << mapping.map("metadata");
215 map<string, string>::const_iterator i = mapping.namespaces().begin();
216 while (i != mapping.namespaces().end()) {
217 out << " xmlns:" << i->first << "='" << i->second << "'";
218 i++;
219 }
220 out << ">\n";
221
222 ostringstream s;
223 SelectedFieldConfiguration ic(analyzers);
224 XmlIndexManager manager(s, mapping);
225 DirAnalyzer analyzer(manager, ic);
226 if (!ic.valid()) {
227 set<string>::const_iterator i;
228 set<string> missing;
229 set_difference(analyzers.begin(), analyzers.end(),
230 ic.availableFields.begin(), ic.availableFields.end(),
231 insert_iterator<set<string> >(missing, missing.begin()));
232 if (missing.size() == 1) {
233 fprintf(stderr, "No field with name %s was found.\n",
234 missing.begin()->c_str());
235 } else {
236 cerr << "The fields ";
237 for (i = missing.begin(); i != missing.end(); ++i) {
238 cerr << ", " << *i;
239 }
240 cerr << " were not found." << endl;
241 }
242 fprintf(stderr, "Choose from:\n");
243 for (i = ic.availableFields.begin(); i != ic.availableFields.end(); ++i) {
244 cerr << " " << *i << endl;
245 }
246 return 1;
247 }
248 if (chdir(argv[1]) == -1) {
249 fprintf(stderr, "%s\n", strerror(errno));
250 return -1;
251 }
252 analyzer.analyzeDir(targetFile);
253 string str = s.str();
254 int32_t n = 2*(int32_t)str.length();
255
256 // if no reference file was specified, we output the analysis
257 if (referenceFile == 0) {
258 cout << str;
259 return 0;
260 }
261
262 // load the file to compare with
263 FileInputStream f(referenceFile);
264 BZ2InputStream bz2(&f);
265 const char* c;
266 n = bz2.read(c, n, n);
267 if (n < 0) {
268 fprintf(stderr, "Error: %s\n", bz2.error());
269 return -1;
270 }
271 if (n != (int32_t)s.str().length()) {
272 cerr << "output length differs " << n << " instead of "
273 << s.str().length() << endl;
274 }
275
276 const char* p1 = c;
277 const char* p2 = str.c_str();
278 int32_t n1 = n;
279 string::size_type n2 = str.length();
280 while (n1-- && n2-- && *p1 == *p2) {
281 p1++;
282 p2++;
283 }
284 if (n1 ==0 && (*p1 || *p2)) {
285 cerr << "difference at position " << p1-c << endl;
286
287 int32_t m = (80 > str.length())?(int32_t)str.length():80;
288 printf("%i %.*s\n", m, m, str.c_str());
289
290 m = (80 > n)?n:80;
291 printf("%i %.*s\n", m, m, c);
292
293 return -1;
294 }
295
296 return 0;
297}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifdef HAVE_CONFIG_H
21 #include <config.h>
22#endif
23
24#include <strigi/strigiconfig.h>
25#include "rdfindexwriter.h"
26#include <strigi/analyzerconfiguration.h>
27#include <strigi/diranalyzer.h>
28#include <strigi/fileinputstream.h>
29#include <iostream>
30#include <cstring>
31#ifdef HAVE_UNISTD_H
32 #include <unistd.h>
33#endif
34#ifdef HAVE_DIRECT_H
35 #include <direct.h>
36#endif
37#include <stdlib.h>
38#include <time.h>
39
40using namespace std;
41using namespace Strigi;
42
/**
 * Write the command line synopsis to stderr.
 *
 * @param argv argument vector; argv[0] is printed as the program name
 * @return always -1, so callers can write "return usage(argc, argv);"
 **/
int
usage(int /*argc*/, char** argv) {
    static const char* const synopsis =
        "Usage: %s\n"
        " [--mappingfile <mappingfile>]\n"
        " [--lastfiletoskip FILE]\n"
        " [--stdinmtime mtime]\n"
        " [--stdinfilename filename]\n"
        " [dirs-or-files-to-index]\n"
        " [-j nthreads]\n";
    fprintf(stderr, synopsis, argv[0]);
    return -1;
}
/**
 * Report whether "-h" or "--help" occurs among the arguments that follow
 * the program name.
 *
 * @param argc number of entries in argv
 * @param argv argument vector; argv[0] is skipped
 * @return true if a help flag was found
 **/
bool
containsHelp(int argc, char **argv) {
    bool found = false;
    for (int pos = 1; !found && pos < argc; ++pos) {
        const char* word = argv[pos];
        found = !strcmp(word, "--help") || !strcmp(word, "-h");
    }
    return found;
}
61void
62analyzeFromStdin(RdfIndexManager& manager, AnalyzerConfiguration& ac,
63 const string& filename, time_t mtime) {
64 StreamAnalyzer sa(ac);
65 sa.setIndexWriter(*manager.indexWriter());
66 FileInputStream in(stdin, filename.c_str());
67 AnalysisResult result(filename, mtime, *manager.indexWriter(), sa);
68 sa.analyze(result, &in);
69}
70
71int
72main(int argc, char **argv) {
73 vector<string> dirs;
74 int nthreads = 2;
75 const char* mappingfile = 0;
76 string lastFileToSkip;
77 time_t stdinMTime = time(0);
78 string stdinFilename = "-";
79 int i = 0;
80 while (++i < argc) {
81 const char* arg = argv[i];
82 if (!strcmp("-h", arg) || !strcmp("--help", arg)) {
83 return usage(argc, argv);
84 }
85 if (!strcmp("-j", arg)) {
86 if (++i == argc) {
87 return usage(argc, argv);
88 }
89 char* end;
90 nthreads = (int)strtol(argv[i], &end, 10);
91 if (end == argv[i] || nthreads < 1) {
92 return usage(argc, argv);
93 }
94 } else if (!strcmp("--mappingfile", arg)) {
95 if (++i == argc) {
96 return usage(argc, argv);
97 }
98 mappingfile = argv[i];
99 } else if (!strcmp("--lastfiletoskip", arg)) {
100 if (++i == argc) {
101 return usage(argc, argv);
102 }
103 lastFileToSkip = argv[i];
104 } else if (!strcmp("--stdinmtime", arg)) {
105 if (++i == argc) {
106 return usage(argc, argv);
107 }
108 char* end;
109 stdinMTime = strtol(argv[i], &end, 10);
110 if (end == argv[i] || stdinMTime < 1) {
111 return usage(argc, argv);
112 }
113 } else if (!strcmp("--stdinfilename", arg)) {
114 if (++i == argc) {
115 return usage(argc, argv);
116 }
117 stdinFilename = argv[i];
118 } else {
119 const char* dir = argv[i];
120 // remove trailing '/'
121 size_t len = strlen(dir);
122 if (dir[len-1] == '/') {
123 dirs.push_back(std::string(dir, len-1));
124 } else {
125 dirs.push_back(dir);
126 }
127 }
128 }
129
130 if (dirs.size() == 0) {
131 char buf[1024];
132 if (getcwd(buf, 1023) == NULL) {
133 return -1;
134 }
135 dirs.push_back(buf);
136 }
137
138 vector<pair<bool,string> >filters;
139 filters.push_back(make_pair<bool,string>(false,".*/"));
140 filters.push_back(make_pair<bool,string>(false,".*"));
141 AnalyzerConfiguration ic;
142 ic.setFilters(filters);
143
144 const TagMapping mapping(mappingfile);
145/* cout << "<?xml version='1.0' encoding='UTF-8'?>\n<"
146 << mapping.map("metadata");
147 map<string, string>::const_iterator k = mapping.namespaces().begin();
148 while (k != mapping.namespaces().end()) {
149 cout << " xmlns:" << k->first << "='" << k->second << "'";
150 k++;
151 }
152 cout << ">\n";
153*/
154 rdfset rdf;
155
156 RdfIndexManager manager(cout, mapping, rdf);
157 DirAnalyzer analyzer(manager, ic);
158 for (unsigned i = 0; i < dirs.size(); ++i) {
159 if (dirs[i] == "-") {
160 analyzeFromStdin(manager, ic, stdinFilename, stdinMTime);
161 } else {
162 analyzer.analyzeDir(dirs[i], nthreads, 0, lastFileToSkip);
163 }
164 }
165// cout << "</" << mapping.map("metadata") << ">\n";
166
167
168 for(rdfset::const_iterator subj = rdf.begin(); subj != rdf.end(); subj++) {
169 cout<< "<" << subj->first << ">";
170
171 std::map<std::string, std::list<std::string> >::const_iterator pred = subj->second.begin();
172 do {
173 cout << "\n\t<" << pred->first << "> ";
174
175 std::list<std::string>::const_iterator obj = pred->second.begin();
176 do {
177 cout << "\n\t\t\"" << *obj << "\"";
178 obj++;
179 if(obj != pred->second.end())
180 cout <<",";
181 } while (obj != pred->second.end());
182 pred++;
183 if(pred!=subj->second.end())
184 cout << ";";
185 } while(pred!=subj->second.end());
186 cout<< ".\n";
187 }
188
189
190 return 0;
191}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include "rdfindexwriter.h"
21using namespace std;
22using namespace Strigi;
23
24void
25RdfIndexWriter::initWriterData(const FieldRegister& f) {
26 map<string, RegisteredField*>::const_iterator i;
27 map<string, RegisteredField*>::const_iterator end(f.fields().end());
28 for (i = f.fields().begin(); i != end; ++i) {
29 Tag* tag = static_cast<Tag*>(i->second->writerData());
30 if (tag) {
31 tag->refcount++;
32 continue;
33 }
34 tag = new Tag();
35 tag->refcount = 1;
36 const string s(i->first);
37 const string& n = mapping.map(s);
38 if (s == n) {
39 tag->open = " <value name='" + n + "'>";
40 tag->close = "</value>\n";
41 } else {
42 tag->open = " <" + n + '>';
43 tag->close = "</" + n + ">\n";
44 }
45 i->second->setWriterData(tag);
46 }
47}
48void
49RdfIndexWriter::releaseWriterData(const FieldRegister& f) {
50 map<string, RegisteredField*>::const_iterator i;
51 map<string, RegisteredField*>::const_iterator end(f.fields().end());
52 for (i = f.fields().begin(); i != end; ++i) {
53 Tag* tag = static_cast<Tag*>(i->second->writerData());
54 if (tag->refcount-- == 1) {
55 //fprintf(stderr, "free for %s\n", i->second->key().c_str());
56 delete tag;
57 i->second->setWriterData(0);
58 }
59 }
60}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifndef RDFINDEXWRITER_H
21#define RDFINDEXWRITER_H
22
23#include <strigi/indexwriter.h>
24#include <strigi/indexmanager.h>
25#include <strigi/analysisresult.h>
26#include "tagmapping.h"
27#include <strigi/fieldtypes.h>
28#include <strigi/analyzerconfiguration.h>
29#include <strigi/strigi_thread.h>
30#include <iostream>
31#include <sstream>
32#include <map>
33#include <list>
34
35typedef std::map<std::string, std::map<std::string, std::list<std::string> > > rdfset;
36
37class RdfIndexWriter : public Strigi::IndexWriter {
38private:
39 struct Data {
40 std::multimap<const Strigi::RegisteredField*, std::string> values;
41 std::string text;
42 };
43 std::map<STRIGI_THREAD_TYPE, std::vector<Data*> > data;
44 struct Tag {
45 std::string open;
46 std::string close;
47 int refcount;
48 };
49
50 STRIGI_MUTEX_DEFINE(mutex);
51 std::ostream& out;
52
53 rdfset& rdf;
54
55 const TagMapping& mapping;
56
57 void printText(const std::string& text) {
58 const char* p = text.c_str();
59 const char* end = p + text.size();
60 char nb = 0;
61 bool lastwhite = true;
62 while (p < end) {
63 char c = *p;
64 if (nb) {
65 if ((0xC0 & c) != 0x80) {
66 return;
67 }
68 out.put(c);
69 nb--;
70 } else if ((0xE0 & c) == 0xC0) {
71 nb = 1;
72 out.put(c);
73 } else if ((0xF0 & c) == 0xE0) {
74 nb = 2;
75 out.put(c);
76 } else if ((0xF8 & c) == 0xF0) {
77 nb = 3;
78 out.put(c);
79 } else if (c <= 8) {
80 return;
81 } else if (c == '&') {
82 out << "&amp;";
83 } else if (c == '<') {
84 out << "&lt;";
85 } else if (c == '>') {
86 out << "&gt;";
87 } else if (isspace(c) != 0) {
88 // we've to handle dos formatting
89 //'\r' char is ignored, it isn't wroten to out and doesn't
90 //change lastwhite value (so the following '\n' will be handled)
91 if (!lastwhite && (c!= '\r')) {
92 out.put(c);
93 lastwhite = true;
94 }
95 } else {
96 lastwhite = false;
97 out.put(c);
98 }
99 p++;
100 }
101 }
102 static void escape(std::string& value) {
103 int namp, nlt, ngt, napos, nexcept;
104 namp = nlt = ngt = napos = nexcept = 0;
105 const char* p = value.c_str();
106 const char* end = p + value.size();
107 char nb = 0;
108 while (p < end) {
109 char c = *p;
110 if (nb) {
111 if ((0xC0 & c) != 0x80) {
112 value = "";
113 return;
114 }
115 nb--;
116 } else if ((0xE0 & c) == 0xC0) {
117 nb = 1;
118 } else if ((0xF0 & c) == 0xE0) {
119 nb = 2;
120 } else if ((0xF8 & c) == 0xF0) {
121 nb = 3;
122 } else if (c < 32 && c != 9 && c != 10 && c != 12) {
123 nexcept++;
124 } else if (c == '&') {
125 namp++;
126 } else if (c == '<') {
127 nlt++;
128 } else if (c == '>') {
129 ngt++;
130 } else if (c == '\'') {
131 napos++;
132 }
133 p++;
134 }
135 // if no character has to be escaped, just return
136 if (!(namp||nlt||ngt|napos|nexcept)) {
137 return;
138 }
139
140 std::string ov(value);
141 p = ov.c_str();
142 end = p + ov.size();
143 int newsize = (int)value.size()+4*namp+3*(nlt+ngt)+5*napos+3*nexcept;
144 value.clear();
145 value.reserve(newsize);
146 while (p < end) {
147 char c = *p;
148 if (nb) {
149 if ((0xC0 & c) != 0x80) {
150 value = "";
151 return;
152 }
153 nb--;
154 value += c;
155 } else if ((0xE0 & c) == 0xC0) {
156 nb = 1;
157 value += c;
158 } else if ((0xF0 & c) == 0xE0) {
159 nb = 2;
160 value += c;
161 } else if ((0xF8 & c) == 0xF0) {
162 nb = 3;
163 value += c;
164 } else if (c < 32 && c != 9 && c != 10 && c != 12) {
165 char s[4];
166 snprintf(s, 4, "%%%2x", (unsigned char)c);
167 value += s;
168 } else if (c == '&') {
169 value += "&amp;";
170 } else if (c == '<') {
171 value += "&lt;";
172 } else if (c == '>') {
173 value += "&gt;";
174 } else if (c == '\'') {
175 value += "&apos;";
176 } else {
177 value += c;
178 }
179 p++;
180 }
181 }
182protected:
183 void startAnalysis(const Strigi::AnalysisResult* ar) {
184 STRIGI_MUTEX_LOCK(&mutex);
185 std::vector<Data*>& dv = data[STRIGI_THREAD_SELF()];
186 STRIGI_MUTEX_UNLOCK(&mutex);
187 unsigned char depth = ar->depth();
188 if (depth >= dv.size()) {
189 dv.push_back(new Data());
190 }
191 Data* d = dv[depth];
192 ar->setWriterData(d);
193 }
194 void printValue(const Strigi::AnalyzerConfiguration& config,
195 const Strigi::RegisteredField* name, std::string& value) {
196 if (config.indexType(name) != Strigi::AnalyzerConfiguration::None) {
197 const Tag* tag = static_cast<const Tag*>(name->writerData());
198 escape(value);
199 out << tag->open << value << tag->close;
200 }
201 }
202 void finishAnalysis(const Strigi::AnalysisResult* ar) {
203 STRIGI_MUTEX_LOCK(&mutex);
204 Data* d = static_cast<Data*>(ar->writerData());
205 const Strigi::AnalyzerConfiguration& config = ar->config();
206 //const Strigi::FieldRegister& fr = config.fieldRegister();
207 std::string v = ar->path();
208 escape(v);
209/* out << " <" << mapping.map("file") << " " << mapping.map("uri")
210 << "='" << v << "' " << mapping.map("mtime") << "='"
211 << (int)ar->mTime()
212 << "'>\n";
213
214 if (ar->encoding().size()) {
215 v.assign(ar->encoding());
216 printValue(config, fr.encodingField, v);
217 }
218
219 std::multimap<const Strigi::RegisteredField*, std::string>::iterator
220 i, end;
221 end = d->values.end();
222 for (i = d->values.begin(); i != end; ++i) {
223 printValue(config, i->first, i->second);
224 }
225 std::ostringstream oss;
226 oss << (int)ar->depth();
227 v = oss.str();
228 printValue(config, fr.embeddepthField, v);
229 if (d->text.size() > 0) {
230 out << " <text>";
231 printText(d->text);
232 out << "</text>\n";
233 }
234 out << " </" << mapping.map("file") << ">\n";
235*/
236 STRIGI_MUTEX_UNLOCK(&mutex);
237
238 std::string subj = d->values.find(config.fieldRegister().pathField)->second;
239
240 for (std::multimap<const Strigi::RegisteredField*, std::string>::iterator i = d->values.begin();
241 i != d->values.end(); i++) {
242 addTriplet(subj, i->first->key(), i->second);
243 }
244 if (!d->text.empty())
245 addTriplet(subj,"http://www.semanticdesktop.org/ontologies/2007/01/19/nie#plainTextContent",d->text);
246
247 d->values.clear();
248 d->text.assign("");
249 }
250 void addText(const Strigi::AnalysisResult* ar, const char* text,
251 int32_t length) {
252 Data* d = static_cast<Data*>(ar->writerData());
253 if (d->text.size() < 10000000) {
254 d->text.append(text, length);
255 d->text.append("\n");
256 }
257 }
258 void addValue(const Strigi::AnalysisResult* ar,
259 const Strigi::RegisteredField* field, const std::string& value) {
260 Data* d = static_cast<Data*>(ar->writerData());
261 d->values.insert(
262 std::make_pair<const Strigi::RegisteredField* const, std::string>(
263 field, value));
264 }
265 void addValue(const Strigi::AnalysisResult* ar,
266 const Strigi::RegisteredField* field,
267 const unsigned char* data, uint32_t size) {
268 Data* d = static_cast<Data*>(ar->writerData());
269 d->values.insert(
270 std::make_pair<const Strigi::RegisteredField* const, std::string>(
271 field, std::string((const char*)data, size)));
272 }
273 void addValue(const Strigi::AnalysisResult* ar,
274 const Strigi::RegisteredField* field, uint32_t value) {
275 Data* d = static_cast<Data*>(ar->writerData());
276 static std::ostringstream v;
277 v.str("");
278 v << value;
279 d->values.insert(
280 std::make_pair<const Strigi::RegisteredField* const, std::string>(
281 field, v.str()));
282 }
283 void addValue(const Strigi::AnalysisResult* ar,
284 const Strigi::RegisteredField* field, int32_t value) {
285 Data* d = static_cast<Data*>(ar->writerData());
286 static std::ostringstream v;
287 v.str("");
288 v << value;
289 d->values.insert(
290 std::make_pair<const Strigi::RegisteredField* const, std::string>(
291 field, v.str()));
292 }
293 void addValue(const Strigi::AnalysisResult* ar,
294 const Strigi::RegisteredField* field, double value) {
295 Data* d = static_cast<Data*>(ar->writerData());
296 static std::ostringstream v;
297 v.str("");
298 v << value;
299 d->values.insert(
300 std::make_pair<const Strigi::RegisteredField* const, std::string>(
301 field, v.str()));
302 }
303 void addTriplet(const std::string& subject,
304 const std::string& predicate, const std::string& object) {
305 STRIGI_MUTEX_LOCK(&mutex);
306 rdf[subject][predicate].push_back(object);
307 STRIGI_MUTEX_UNLOCK(&mutex);
308 }
309 void addValue(const Strigi::AnalysisResult*,
310 const Strigi::RegisteredField* field, const std::string& name,
311 const std::string& value) {}
312 void initWriterData(const Strigi::FieldRegister&);
313 void releaseWriterData(const Strigi::FieldRegister&);
314public:
315 explicit RdfIndexWriter(std::ostream& o, const TagMapping& m, rdfset& r)
316 :out(o), rdf(r), mapping(m) {
317 STRIGI_MUTEX_INIT(&mutex);
318 }
319 ~RdfIndexWriter() {
320 std::map<STRIGI_THREAD_TYPE, std::vector<Data*> >::const_iterator j;
321 for (j = data.begin(); j != data.end(); ++j) {
322 std::vector<Data*>::const_iterator i;
323 for (i = j->second.begin(); i != j->second.end(); ++i) {
324 delete *i;
325 }
326 }
327 STRIGI_MUTEX_DESTROY(&mutex);
328 }
329 void commit() {}
330 void deleteEntries(const std::vector<std::string>& entries) {}
331 void deleteAllEntries() {}
332};
333
334class RdfIndexManager : public Strigi::IndexManager {
335private:
336 RdfIndexWriter writer;
337public:
338 RdfIndexManager(std::ostream& o, const TagMapping& m, rdfset& r) :writer(o, m, r) {}
339 Strigi::IndexWriter* indexWriter() {
340 return &writer;
341 }
342 Strigi::IndexReader* indexReader() {
343 return 0;
344 }
345};
346
347#endif
  
1rdf:http://www.w3.org/1999/02/22-rdf-syntax-ns#
2dc:http://purl.org/dc/elements/1.1/
3audio:eh?missing?
4metadata rdf:RDF
5file rdf:Description
6uri rdf:about
7title audio:title
8artist audio:artist
9album audio:album
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include "tagmapping.h"
21#include <iostream>
22#include <fstream>
23using namespace std;
24
25TagMapping::TagMapping(const char* path) {
26 if (path == 0) return;
27 ifstream file(path);
28 string line;
29 for (;;) {
30 getline(file, line);
31 if (!file.good()) {
32 break;
33 }
34 string::size_type p = line.find('\t');
35 if (p != string::npos) {
36 mapping[line.substr(0, p)] = line.substr(p+1);
37 } else {
38 p = line.find(':');
39 if (p != string::npos) {
40 m_namespaces[line.substr(0, p)] = line.substr(p+1);
41 }
42 }
43 }
44}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
#ifndef TAGMAPPING_H
// must be spelled exactly like the macro tested above, otherwise the guard
// never takes effect and a second inclusion redefines the class
#define TAGMAPPING_H

#include <map>
#include <string>

/**
 * Lookup table for renaming tags, plus a table of namespace declarations.
 * Both tables are private and only exposed read-only.
 **/
class TagMapping {
private:
    std::map<std::string, std::string> m_namespaces;
    std::map<std::string, std::string> mapping;
public:
    /** @param mappingfile path handed to the constructor defined in the
     *         implementation file */
    TagMapping(const char* mappingfile);
    /** @return the namespace table */
    const std::map<std::string, std::string>& namespaces() const {
        return m_namespaces;
    }
    /**
     * Look up the replacement for key.
     *
     * @return the mapped name, or key itself when there is no entry. In
     *         that case the result refers to the caller's argument, so it
     *         must not outlive that argument.
     */
    const std::string& map(const std::string& key) const {
        std::map<std::string, std::string>::const_iterator i
            = mapping.find(key);
        return (i == mapping.end()) ?key :i->second;
    }
};

#endif
  
1#! /usr/bin/python
2import sys
3import time
4from xml.sax import make_parser, handler, SAXException
5
class UriLogger(handler.ContentHandler):
    """SAX handler that counts elements carrying a 'uri' attribute.

    Every 1000th such element a progress line is printed with the count,
    the average number of elements per second and the latest uri. A final
    summary line is printed at the end of the document.

    The 'uri' attribute of the instance only exists once a first uri has
    been seen.
    """

    def __init__(self):
        handler.ContentHandler.__init__(self)
        self.count = 0   # elements with a 'uri' attribute seen so far
        self.start = 0   # time of the first such element, 0 if none yet

    def _rate(self, amount):
        """Return amount per second since the first uri was seen.

        Returns 0 when no uri has been seen yet or no measurable time has
        passed, instead of dividing by zero.
        """
        if self.start == 0:
            return 0
        elapsed = time.time() - self.start
        if elapsed <= 0:
            return 0
        return amount / elapsed

    def startElement(self, name, attrs):
        # "in" works on SAX attribute objects in both Python 2 and 3,
        # has_key() only in Python 2
        if 'uri' not in attrs:
            return
        if self.start == 0:
            self.start = time.time()
        self.uri = attrs['uri']
        self.count += 1
        if self.count % 1000 == 0:
            print('%9d %9d %s' % (self.count, self._rate(self.count),
                                  self.uri))

    def endDocument(self):
        print('%9d %9d' % (self.count, self._rate(self.count - 1)))
27
28# this script reads from standard input and parses it as xml
29# if the xml is invalid, it will print an error message
30
parser = make_parser()
urilogger = UriLogger()
parser.setContentHandler(urilogger)

try:
    parser.parse(sys.stdin)
except SAXException as e:
    # "except ... as" and print() are accepted by Python 2.6+ and Python 3
    if hasattr(urilogger, 'uri'):
        # tell the user which entry was the last one parsed successfully
        print("Error after " + urilogger.uri)
    print(e)
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifdef HAVE_CONFIG_H
21 #include <config.h>
22#endif
23
24#include <strigi/strigiconfig.h>
25#include "xmlindexwriter.h"
26#include <strigi/analyzerconfiguration.h>
27#include <strigi/diranalyzer.h>
28#include <strigi/fileinputstream.h>
29#include <iostream>
30#include <cstring>
31#ifdef HAVE_UNISTD_H
32 #include <unistd.h>
33#endif
34#ifdef HAVE_DIRECT_H
35 #include <direct.h>
36#endif
37#include <stdlib.h>
38#include <time.h>
39
40using namespace std;
41using namespace Strigi;
42
/**
 * Print the list of supported options to stderr.
 *
 * @param argv argument vector; argv[0] supplies the program name
 * @return always -1, which callers return from main()
 **/
int
usage(int /*argc*/, char** argv) {
    const char* const program = argv[0];
    fprintf(stderr,
        "Usage: %s\n"
        " [--mappingfile <mappingfile>]\n"
        " [--lastfiletoskip FILE]\n"
        " [--stdinmtime mtime]\n"
        " [--stdinfilename filename]\n"
        " [dirs-or-files-to-index]\n"
        " [-j nthreads]\n",
        program);
    return -1;
}
/**
 * Check the arguments after the program name for a help request.
 *
 * @param argc number of entries in argv
 * @param argv argument vector; argv[0] is not examined
 * @return true when "--help" or "-h" is among the arguments
 **/
bool
containsHelp(int argc, char **argv) {
    char** const stop = argv + argc;
    for (char** cur = argv + 1; cur < stop; ++cur) {
        const bool isLong = strcmp(*cur, "--help") == 0;
        if (isLong || strcmp(*cur, "-h") == 0) {
            return true;
        }
    }
    return false;
}
61void
62analyzeFromStdin(XmlIndexManager& manager, AnalyzerConfiguration& ac,
63 const string& filename, time_t mtime) {
64 StreamAnalyzer sa(ac);
65 sa.setIndexWriter(*manager.indexWriter());
66 FileInputStream in(stdin, filename.c_str());
67 AnalysisResult result(filename, mtime, *manager.indexWriter(), sa);
68 sa.analyze(result, &in);
69}
70
71int
72main(int argc, char **argv) {
73 vector<string> dirs;
74 int nthreads = 2;
75 const char* mappingfile = 0;
76 string lastFileToSkip;
77 time_t stdinMTime = time(0);
78 string stdinFilename = "-";
79 int i = 0;
80 while (++i < argc) {
81 const char* arg = argv[i];
82 if (!strcmp("-h", arg) || !strcmp("--help", arg)) {
83 return usage(argc, argv);
84 }
85 if (!strcmp("-j", arg)) {
86 if (++i == argc) {
87 return usage(argc, argv);
88 }
89 char* end;
90 nthreads = (int)strtol(argv[i], &end, 10);
91 if (end == argv[i] || nthreads < 1) {
92 return usage(argc, argv);
93 }
94 } else if (!strcmp("--mappingfile", arg)) {
95 if (++i == argc) {
96 return usage(argc, argv);
97 }
98 mappingfile = argv[i];
99 } else if (!strcmp("--lastfiletoskip", arg)) {
100 if (++i == argc) {
101 return usage(argc, argv);
102 }
103 lastFileToSkip = argv[i];
104 } else if (!strcmp("--stdinmtime", arg)) {
105 if (++i == argc) {
106 return usage(argc, argv);
107 }
108 char* end;
109 stdinMTime = strtol(argv[i], &end, 10);
110 if (end == argv[i] || stdinMTime < 1) {
111 return usage(argc, argv);
112 }
113 } else if (!strcmp("--stdinfilename", arg)) {
114 if (++i == argc) {
115 return usage(argc, argv);
116 }
117 stdinFilename = argv[i];
118 } else {
119 const char* dir = argv[i];
120 // remove trailing '/'
121 size_t len = strlen(dir);
122 if (dir[len-1] == '/') {
123 dirs.push_back(std::string(dir, len-1));
124 } else {
125 dirs.push_back(dir);
126 }
127 }
128 }
129
130 if (dirs.size() == 0) {
131 char buf[1024];
132 if (getcwd(buf, 1023) == NULL) {
133 return -1;
134 }
135 dirs.push_back(buf);
136 }
137
138 vector<pair<bool,string> >filters;
139 filters.push_back(make_pair<bool,string>(false,".*/"));
140 filters.push_back(make_pair<bool,string>(false,".*"));
141 AnalyzerConfiguration ic;
142 ic.setFilters(filters);
143
144 const TagMapping mapping(mappingfile);
145 cout << "<?xml version='1.0' encoding='UTF-8'?>\n<"
146 << mapping.map("metadata");
147 map<string, string>::const_iterator k = mapping.namespaces().begin();
148 while (k != mapping.namespaces().end()) {
149 cout << " xmlns:" << k->first << "='" << k->second << "'";
150 k++;
151 }
152 cout << ">\n";
153
154 XmlIndexManager manager(cout, mapping);
155 DirAnalyzer analyzer(manager, ic);
156 for (unsigned i = 0; i < dirs.size(); ++i) {
157 if (dirs[i] == "-") {
158 analyzeFromStdin(manager, ic, stdinFilename, stdinMTime);
159 } else {
160 analyzer.analyzeDir(dirs[i], nthreads, 0, lastFileToSkip);
161 }
162 }
163 cout << "</" << mapping.map("metadata") << ">\n";
164
165 return 0;
166}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#include "xmlindexwriter.h"
21using namespace std;
22using namespace Strigi;
23
24void
25XmlIndexWriter::initWriterData(const FieldRegister& f) {
26 map<string, RegisteredField*>::const_iterator i;
27 map<string, RegisteredField*>::const_iterator end(f.fields().end());
28 for (i = f.fields().begin(); i != end; ++i) {
29 Tag* tag = static_cast<Tag*>(i->second->writerData());
30 if (tag) {
31 tag->refcount++;
32 continue;
33 }
34 tag = new Tag();
35 tag->refcount = 1;
36 const string s(i->first);
37 const string& n = mapping.map(s);
38 if (s == n) {
39 tag->open = " <value name='" + n + "'>";
40 tag->close = "</value>\n";
41 } else {
42 tag->open = " <" + n + '>';
43 tag->close = "</" + n + ">\n";
44 }
45 i->second->setWriterData(tag);
46 }
47}
48void
49XmlIndexWriter::releaseWriterData(const FieldRegister& f) {
50 map<string, RegisteredField*>::const_iterator i;
51 map<string, RegisteredField*>::const_iterator end(f.fields().end());
52 for (i = f.fields().begin(); i != end; ++i) {
53 Tag* tag = static_cast<Tag*>(i->second->writerData());
54 if (tag->refcount-- == 1) {
55 //fprintf(stderr, "free for %s\n", i->second->key().c_str());
56 delete tag;
57 i->second->setWriterData(0);
58 }
59 }
60}
  
1/* This file is part of Strigi Desktop Search
2 *
3 * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20#ifndef XMLINDEXWRITER_H
21#define XMLINDEXWRITER_H
22
23#include "tagmapping.h"
24#include <strigi/indexwriter.h>
25#include <strigi/indexmanager.h>
26#include <strigi/analysisresult.h>
27#include <strigi/fieldtypes.h>
28#include <strigi/analyzerconfiguration.h>
29#include <strigi/strigi_thread.h>
30#include <iostream>
31#include <sstream>
32#include <map>
33
34class XmlIndexWriter : public Strigi::IndexWriter {
35private:
36 struct Data {
37 std::multimap<const Strigi::RegisteredField*, std::string> values;
38 std::string text;
39 };
40 std::map<STRIGI_THREAD_TYPE, std::vector<Data*> > data;
41 struct Tag {
42 std::string open;
43 std::string close;
44 int refcount;
45 };
46
47 STRIGI_MUTEX_DEFINE(mutex);
48 std::ostream& out;
49
50 const TagMapping& mapping;
51
52 void printText(const std::string& text) {
53 const char* p = text.c_str();
54 const char* end = p + text.size();
55 char nb = 0;
56 bool lastwhite = true;
57 while (p < end) {
58 char c = *p;
59 if (nb) {
60 if ((0xC0 & c) != 0x80) {
61 return;
62 }
63 out.put(c);
64 nb--;
65 } else if ((0xE0 & c) == 0xC0) {
66 nb = 1;
67 out.put(c);
68 } else if ((0xF0 & c) == 0xE0) {
69 nb = 2;
70 out.put(c);
71 } else if ((0xF8 & c) == 0xF0) {
72 nb = 3;
73 out.put(c);
74 } else if (c <= 8) {
75 return;
76 } else if (c == '&') {
77 out << "&amp;";
78 } else if (c == '<') {
79 out << "&lt;";
80 } else if (c == '>') {
81 out << "&gt;";
82 } else if (isspace(c) != 0) {
83 // we've to handle dos formatting
84 //'\r' char is ignored, it isn't wroten to out and doesn't
85 //change lastwhite value (so the following '\n' will be handled)
86 if (!lastwhite && (c!= '\r')) {
87 out.put(c);
88 lastwhite = true;
89 }
90 } else {
91 lastwhite = false;
92 out.put(c);
93 }
94 p++;
95 }
96 }
97 static void escape(std::string& value) {
98 int namp, nlt, ngt, napos, nexcept;
99 namp = nlt = ngt = napos = nexcept = 0;
100 const char* p = value.c_str();
101 const char* end = p + value.size();
102 char nb = 0;
103 while (p < end) {
104 char c = *p;
105 if (nb) {
106 if ((0xC0 & c) != 0x80) {
107 value = "";
108 return;
109 }
110 nb--;
111 } else if ((0xE0 & c) == 0xC0) {
112 nb = 1;
113 } else if ((0xF0 & c) == 0xE0) {
114 nb = 2;
115 } else if ((0xF8 & c) == 0xF0) {
116 nb = 3;
117 } else if (c < 32 && c != 9 && c != 10 && c != 12) {
118 nexcept++;
119 } else if (c == '&') {
120 namp++;
121 } else if (c == '<') {
122 nlt++;
123 } else if (c == '>') {
124 ngt++;
125 } else if (c == '\'') {
126 napos++;
127 }
128 p++;
129 }
130 // if no character has to be escaped, just return
131 if (!(namp||nlt||ngt|napos|nexcept)) {
132 return;
133 }
134
135 std::string ov(value);
136 p = ov.c_str();
137 end = p + ov.size();
138 int newsize = (int)value.size()+4*namp+3*(nlt+ngt)+5*napos+3*nexcept;
139 value.clear();
140 value.reserve(newsize);
141 while (p < end) {
142 char c = *p;
143 if (nb) {
144 if ((0xC0 & c) != 0x80) {
145 value = "";
146 return;
147 }
148 nb--;
149 value += c;
150 } else if ((0xE0 & c) == 0xC0) {
151 nb = 1;
152 value += c;
153 } else if ((0xF0 & c) == 0xE0) {
154 nb = 2;
155 value += c;
156 } else if ((0xF8 & c) == 0xF0) {
157 nb = 3;
158 value += c;
159 } else if (c < 32 && c != 9 && c != 10 && c != 12) {
160 char s[4];
161 snprintf(s, 4, "%%%2x", (unsigned char)c);
162 value += s;
163 } else if (c == '&') {
164 value += "&amp;";
165 } else if (c == '<') {
166 value += "&lt;";
167 } else if (c == '>') {
168 value += "&gt;";
169 } else if (c == '\'') {
170 value += "&apos;";
171 } else {
172 value += c;
173 }
174 p++;
175 }
176 }
177protected:
178 void startAnalysis(const Strigi::AnalysisResult* ar) {
179 STRIGI_MUTEX_LOCK(&mutex);
180 std::vector<Data*>& dv = data[STRIGI_THREAD_SELF()];
181 STRIGI_MUTEX_UNLOCK(&mutex);
182 unsigned char depth = ar->depth();
183 if (depth >= dv.size()) {
184 dv.push_back(new Data());
185 }
186 Data* d = dv[depth];
187 ar->setWriterData(d);
188 }
189 void printValue(const Strigi::AnalyzerConfiguration& config,
190 const Strigi::RegisteredField* name, std::string& value) {
191 if (config.indexType(name) != Strigi::AnalyzerConfiguration::None) {
192 const Tag* tag = static_cast<const Tag*>(name->writerData());
193 escape(value);
194 out << tag->open << value << tag->close;
195 }
196 }
197 void finishAnalysis(const Strigi::AnalysisResult* ar) {
198 STRIGI_MUTEX_LOCK(&mutex);
199 Data* d = static_cast<Data*>(ar->writerData());
200 const Strigi::AnalyzerConfiguration& config = ar->config();
201 const Strigi::FieldRegister& fr = config.fieldRegister();
202 std::string v = ar->path();
203 escape(v);
204 out << " <" << mapping.map("file") << " " << mapping.map("uri")
205 << "='" << v << "' " << mapping.map("mtime") << "='"
206 << (int)ar->mTime()
207 << "'>\n";
208
209 if (ar->encoding().size()) {
210 v.assign(ar->encoding());
211 printValue(config, fr.encodingField, v);
212 }
213
214 std::multimap<const Strigi::RegisteredField*, std::string>::iterator
215 i, end;
216 end = d->values.end();
217 for (i = d->values.begin(); i != end; ++i) {
218 printValue(config, i->first, i->second);
219 }
220 std::ostringstream oss;
221 oss << (int)ar->depth();
222 v = oss.str();
223 printValue(config, fr.embeddepthField, v);
224 if (d->text.size() > 0) {
225 out << " <text>";
226 printText(d->text);
227 out << "</text>\n";
228 }
229 out << " </" << mapping.map("file") << ">\n";
230 STRIGI_MUTEX_UNLOCK(&mutex);
231 d->values.clear();
232 d->text.assign("");
233 }
234 void addText(const Strigi::AnalysisResult* ar, const char* text,
235 int32_t length) {
236 Data* d = static_cast<Data*>(ar->writerData());
237 if (d->text.size() < 10000000) {
238 d->text.append(text, length);
239 d->text.append("\n");
240 }
241 }
242 void addValue(const Strigi::AnalysisResult* ar,
243 const Strigi::RegisteredField* field, const std::string& value) {
244 Data* d = static_cast<Data*>(ar->writerData());
245 d->values.insert(
246 std::make_pair<const Strigi::RegisteredField* const, std::string>(
247 field, value));
248 }
249 void addValue(const Strigi::AnalysisResult* ar,
250 const Strigi::RegisteredField* field,
251 const unsigned char* data, uint32_t size) {
252 Data* d = static_cast<Data*>(ar->writerData());
253 d->values.insert(
254 std::make_pair<const Strigi::RegisteredField* const, std::string>(
255 field, std::string((const char*)data, size)));
256 }
257 void addValue(const Strigi::AnalysisResult* ar,
258 const Strigi::RegisteredField* field, uint32_t value) {
259 Data* d = static_cast<Data*>(ar->writerData());
260 static std::ostringstream v;
261 v.str("");
262 v << value;
263 d->values.insert(
264 std::make_pair<const Strigi::RegisteredField* const, std::string>(
265 field, v.str()));
266 }
267 void addValue(const Strigi::AnalysisResult* ar,
268 const Strigi::RegisteredField* field, int32_t value) {
269 Data* d = static_cast<Data*>(ar->writerData());
270 static std::ostringstream v;
271 v.str("");
272 v << value;
273 d->values.insert(
274 std::make_pair<const Strigi::RegisteredField* const, std::string>(
275 field, v.str()));
276 }
277 void addValue(const Strigi::AnalysisResult* ar,
278 const Strigi::RegisteredField* field, double value) {
279 Data* d = static_cast<Data*>(ar->writerData());
280 static std::ostringstream v;
281 v.str("");
282 v << value;
283 d->values.insert(
284 std::make_pair<const Strigi::RegisteredField* const, std::string>(
285 field, v.str()));
286 }
287 void addTriplet(const std::string& subject,
288 const std::string& predicate, const std::string& object) {}
289 void addValue(const Strigi::AnalysisResult*,
290 const Strigi::RegisteredField* field, const std::string& name,
291 const std::string& value) {}
292 void initWriterData(const Strigi::FieldRegister&);
293 void releaseWriterData(const Strigi::FieldRegister&);
294public:
295 explicit XmlIndexWriter(std::ostream& o, const TagMapping& m)
296 :out(o), mapping(m) {
297 STRIGI_MUTEX_INIT(&mutex);
298 }
299 ~XmlIndexWriter() {
300 std::map<STRIGI_THREAD_TYPE, std::vector<Data*> >::const_iterator j;
301 for (j = data.begin(); j != data.end(); ++j) {
302 std::vector<Data*>::const_iterator i;
303 for (i = j->second.begin(); i != j->second.end(); ++i) {
304 delete *i;
305 }
306 }
307 STRIGI_MUTEX_DESTROY(&mutex);
308 }
309 void commit() {}
310 void deleteEntries(const std::vector<std::string>& entries) {}
311 void deleteAllEntries() {}
312};
313
314class XmlIndexManager : public Strigi::IndexManager {
315private:
316 XmlIndexWriter writer;
317public:
318 XmlIndexManager(std::ostream& o, const TagMapping& m) :writer(o, m) {}
319 Strigi::IndexWriter* indexWriter() {
320 return &writer;
321 }
322 Strigi::IndexReader* indexReader() {
323 return 0;
324 }
325};
326
327#endif