Commit 932f6f6e9d978d9c21198b0bc3f41baf968990f1
- Diff rendering mode:
- inline
- side by side
CMakeLists.txt
(1 / 0)
|   | |||
| 10 | 10 | ||
| 11 | 11 | add_subdirectory(libstreams) | |
| 12 | 12 | add_subdirectory(libstreamanalyzer) | |
| 13 | add_subdirectory(strigiutils) | ||
| 13 | 14 | add_subdirectory(strigidaemon) | |
| 14 | 15 | add_subdirectory(strigiclient) |
strigiutils/CMakeLists.txt
(35 / 0)
|   | |||
# Build configuration for strigiutils. It can be configured on its own or as
# part of the meta-package (detected through STRIGI_VERSION_STRING).

##### cmake settings #####

# The minimum version is declared before project() so that the version and
# policy settings are in effect when the project is set up.
cmake_minimum_required(VERSION 2.6)
project (strigiutils)

set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
#include(MacroCheckGccVisibility)
#include(MacroFindOptionalDep)
enable_testing()


##### global variables #####


##### environment inspection #####

# check for required packages
if(STRIGI_VERSION_STRING)
  # if STRIGI_VERSION_STRING is defined, we are compiling the meta-package
  # and take the headers straight from the sibling source and build trees
  set(LIBSTREAMS_INCLUDES
    ../libstreams/include
    ${CMAKE_CURRENT_BINARY_DIR}/../libstreams/include)
  set(LIBSTREAMANALYZER_INCLUDES
    ../libstreamanalyzer/include
    ${CMAKE_CURRENT_BINARY_DIR}/../libstreamanalyzer/include)
else(STRIGI_VERSION_STRING)
  # standalone build: both libraries have to be found on the system
  find_package(libstreams REQUIRED)
  find_package(libstreamanalyzer REQUIRED)
endif(STRIGI_VERSION_STRING)

##### building and testing #####
include_directories(${LIBSTREAMS_INCLUDES})
include_directories(${LIBSTREAMANALYZER_INCLUDES})

add_subdirectory(bin)
|   | |||
# Subdirectories that are part of this build.
add_subdirectory(deepfind)
add_subdirectory(xmlindexer)
|   | |||
# When regex.h is not available, compile regex.c from the strigi source tree.
if(NOT HAVE_REGEX_H)
  set(REGEX_INCLUDE_DIR ${strigi_SOURCE_DIR}/src/streams/strigi/regex)
  set(REGEX_SOURCES ${REGEX_INCLUDE_DIR}/regex.c)
  # TODO: install copyright file !
endif(NOT HAVE_REGEX_H)

include_directories(
  ../streamanalyzer
  ../streams
  ${REGEX_INCLUDE_DIR}
  ${strigi_BINARY_DIR}/src/streams
)

# dummyindexer is always built
add_executable(dummyindexer dummyindexer.cpp)
target_link_libraries(dummyindexer streamanalyzer)

# deepfind is built and installed only when BUILD_DEEPTOOLS is set
if(BUILD_DEEPTOOLS)
  add_executable(deepfind deepfind.cpp)
  target_link_libraries(deepfind streamanalyzer)
  install(TARGETS deepfind RUNTIME DESTINATION bin)
endif (BUILD_DEEPTOOLS)

# static library with the grep index classes (plus regex.c when needed)
add_library(grepindexer STATIC
  grepindexreader.cpp
  grepindexmanager.cpp
  grepindexwriter.cpp
  ${REGEX_SOURCES})

# deepgrep is built and installed only when BUILD_DEEPTOOLS is set
if(BUILD_DEEPTOOLS)
  add_executable(deepgrep deepgrep.cpp)
  target_link_libraries(deepgrep grepindexer streamanalyzer ${REGEX_LIBRARIES})
  install(TARGETS deepgrep RUNTIME DESTINATION bin)
endif (BUILD_DEEPTOOLS)

add_executable(greptest grepindexreader.cpp)
target_link_libraries(greptest streamanalyzer)

# the latency tester is not built with MSVC
if(NOT MSVC)
  add_executable(analyzerlatencytester analyzerlatencytester.cpp)
  target_link_libraries(analyzerlatencytester streamanalyzer)
endif(NOT MSVC)

add_library(grepindex STATIC grepindexmanager.cpp)
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2008 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include <strigi/strigiconfig.h> | ||
| 22 | #include <strigi/indexmanager.h> | ||
| 23 | #include <strigi/indexwriter.h> | ||
| 24 | #include <strigi/diranalyzer.h> | ||
| 25 | #include <strigi/analyzerconfiguration.h> | ||
| 26 | #include <iostream> | ||
| 27 | #include <map> | ||
| 28 | #include <cassert> | ||
| 29 | #include <cmath> | ||
| 30 | #include <climits> | ||
| 31 | #include <sys/time.h> | ||
| 32 | #include <time.h> | ||
| 33 | |||
| 34 | using namespace Strigi; | ||
| 35 | using namespace std; | ||
| 36 | |||
/**
 * Returns the difference a - b in seconds.
 * The seconds part and the microseconds part are each converted to float
 * and then added together.
 **/
float
elapsed(const struct timeval& a, const struct timeval& b) {
    const float wholeSeconds = static_cast<float>(a.tv_sec - b.tv_sec);
    const float microSeconds = static_cast<float>(a.tv_usec - b.tv_usec);
    return wholeSeconds + microSeconds / 1.0e6f;
}
| 42 | |||
| 43 | class LatencyMeasurer : public AnalyzerConfiguration { | ||
| 44 | private: | ||
| 45 | class Private; | ||
| 46 | Private* const d; | ||
| 47 | |||
| 48 | // We implement this function so we can count the number of files analyzed. | ||
| 49 | bool indexFile(const char* path, const char* filename) const; | ||
| 50 | // This is the function that analyzers should call often to reduce latency. | ||
| 51 | // Here we always let the measurement continue so we can measure more. | ||
| 52 | bool indexMore() const; | ||
| 53 | public: | ||
| 54 | LatencyMeasurer(); | ||
| 55 | ~LatencyMeasurer(); | ||
| 56 | void printReport(); | ||
| 57 | }; | ||
| 58 | |||
| 59 | class LatencyMeasurer::Private { | ||
| 60 | public: | ||
| 61 | struct timeval starttime, lasttime; | ||
| 62 | int32_t numberOfChecks; | ||
| 63 | long numberOfFiles; | ||
| 64 | map<int, int> histogram; | ||
| 65 | string beforeLastFile; | ||
| 66 | struct timeval beforeLastTime; | ||
| 67 | string lastFile; | ||
| 68 | struct timeval lastTime; | ||
| 69 | Private() :numberOfChecks(0), numberOfFiles(0) { | ||
| 70 | starttime.tv_sec = -1; | ||
| 71 | } | ||
| 72 | void print(); | ||
| 73 | }; | ||
| 74 | // We implement this function so we can count the number of files analyzed. | ||
| 75 | bool | ||
| 76 | LatencyMeasurer::indexFile(const char* path, const char* filename) const { | ||
| 77 | d->beforeLastFile.assign(d->lastFile); | ||
| 78 | d->beforeLastTime = d->lastTime; | ||
| 79 | d->lastFile.assign(path); | ||
| 80 | gettimeofday(&d->lastTime, NULL); | ||
| 81 | d->numberOfFiles++; | ||
| 82 | return true; | ||
| 83 | } | ||
| 84 | // This is the function that analyzers should call often to reduce latency. | ||
| 85 | // Here we always let the measurement continue so we can measure more. | ||
| 86 | bool | ||
| 87 | LatencyMeasurer::indexMore() const { | ||
| 88 | d->numberOfChecks++; | ||
| 89 | struct timeval now; | ||
| 90 | gettimeofday(&now, NULL); | ||
| 91 | if (d->starttime.tv_sec == -1) { | ||
| 92 | d->lasttime = d->starttime = now; | ||
| 93 | } | ||
| 94 | d->histogram[static_cast<int>(10*log10(elapsed(now, d->lasttime)))]++; | ||
| 95 | if (elapsed(now, d->lasttime) > 1) { | ||
| 96 | cerr << d->beforeLastFile << " started " | ||
| 97 | << elapsed(now, d->beforeLastTime) << " seconds ago." << endl; | ||
| 98 | cerr << d->lastFile << " started " | ||
| 99 | << elapsed(now, d->lastTime) << " seconds ago." << endl; | ||
| 100 | assert(elapsed(now, d->lasttime) < 3); | ||
| 101 | } | ||
| 102 | d->lasttime = now; | ||
| 103 | return true; | ||
| 104 | } | ||
| 105 | LatencyMeasurer::LatencyMeasurer() :d(new Private()) { | ||
| 106 | } | ||
| 107 | LatencyMeasurer::~LatencyMeasurer() { | ||
| 108 | delete d; | ||
| 109 | } | ||
| 110 | void | ||
| 111 | LatencyMeasurer::printReport() { | ||
| 112 | d->print(); | ||
| 113 | } | ||
| 114 | void | ||
| 115 | LatencyMeasurer::Private::print() { | ||
| 116 | struct timeval now; | ||
| 117 | gettimeofday(&now, NULL); | ||
| 118 | cout << numberOfChecks << " checks in " << numberOfFiles << " files." | ||
| 119 | << endl; | ||
| 120 | cout << "On average " << (elapsed(now, starttime)/(float)numberOfChecks) | ||
| 121 | << " seconds between checks." << endl; | ||
| 122 | int smallestTime = INT_MAX; | ||
| 123 | int largestTime = INT_MIN; | ||
| 124 | double total = 0; | ||
| 125 | for (map<int,int>::const_iterator i = histogram.begin(); | ||
| 126 | i != histogram.end(); ++i) { | ||
| 127 | int n = i->first; | ||
| 128 | total += pow(10.0, 0.1*n) * histogram[n]; | ||
| 129 | if (n > largestTime && n < 1000) largestTime = n; | ||
| 130 | if (n < smallestTime && n > -1000) smallestTime = n; | ||
| 131 | } | ||
| 132 | double sum = 0; | ||
| 133 | for (int n=smallestTime; n<=largestTime; ++n) { | ||
| 134 | sum += pow(10.0,0.1*n) * histogram[n]/total; | ||
| 135 | cout << pow(10.0,0.1*n) << '\t' << 1-sum << endl; | ||
| 136 | } | ||
| 137 | } | ||
| 138 | |||
| 139 | class DummyWriter : public IndexWriter { | ||
| 140 | private: | ||
| 141 | void startAnalysis(const AnalysisResult*) {} | ||
| 142 | void addText(const AnalysisResult*, const char*, int32_t) {} | ||
| 143 | void addValue(const AnalysisResult*, const RegisteredField*, const string&) {} | ||
| 144 | void addValue(const AnalysisResult*, const RegisteredField*, const unsigned char*, uint32_t) {} | ||
| 145 | void addValue(const AnalysisResult*, const RegisteredField*, int32_t) {} | ||
| 146 | void addValue(const AnalysisResult*, const RegisteredField*, uint32_t) {} | ||
| 147 | void addValue(const AnalysisResult*, const RegisteredField*, double) {} | ||
| 148 | void addValue(const AnalysisResult*, const RegisteredField*, const string&, const string&) {} | ||
| 149 | void finishAnalysis(const AnalysisResult*) {} | ||
| 150 | void addTriplet(const string&, const string&, const string&) {} | ||
| 151 | void deleteEntries(const vector<string>&) {} | ||
| 152 | void deleteAllEntries() {} | ||
| 153 | }; | ||
| 154 | |||
| 155 | class DummyManager : public IndexManager { | ||
| 156 | private: | ||
| 157 | DummyWriter dummywriter; | ||
| 158 | IndexReader* indexReader() { return 0; } | ||
| 159 | IndexWriter* indexWriter() { return &dummywriter; } | ||
| 160 | }; | ||
| 161 | |||
| 162 | int | ||
| 163 | main(int argc, char** argv) { | ||
| 164 | if (argc == 1) { | ||
| 165 | cerr << argv[0] | ||
| 166 | << " is a tool for testing the latency of the analyzers." << endl; | ||
| 167 | cerr << "Provide a directory to test on." << endl; | ||
| 168 | return 1; | ||
| 169 | } | ||
| 170 | |||
| 171 | LatencyMeasurer measurer; | ||
| 172 | DummyManager manager; | ||
| 173 | DirAnalyzer analyzer(manager, measurer); | ||
| 174 | int nthreads = 1; | ||
| 175 | for (int32_t i=1; i<argc; ++i) { | ||
| 176 | analyzer.analyzeDir(argv[i], nthreads); | ||
| 177 | } | ||
| 178 | |||
| 179 | measurer.printReport(); | ||
| 180 | return 0; | ||
| 181 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include <strigi/strigiconfig.h> | ||
| 21 | #include "dummyindexwriter.h" | ||
| 22 | #include <strigi/streamanalyzer.h> | ||
| 23 | #include <strigi/analyzerconfiguration.h> | ||
| 24 | #include <strigi/streamendanalyzer.h> | ||
| 25 | #include <strigi/diranalyzer.h> | ||
| 26 | using namespace Strigi; | ||
| 27 | using namespace std; | ||
| 28 | |||
| 29 | /** | ||
| 30 | * Special indexer that indexes only the filenames. | ||
| 31 | **/ | ||
| 32 | class FindIndexerConfiguration : public AnalyzerConfiguration { | ||
| 33 | public: | ||
| 34 | bool useFactory(StreamEndAnalyzerFactory* e) const { | ||
| 35 | return e->analyzesSubStreams(); | ||
| 36 | } | ||
| 37 | bool useFactory(StreamThroughAnalyzerFactory*) const {return false;} | ||
| 38 | bool indexMore() const {return true;} | ||
| 39 | bool addMoreText() const {return false;} | ||
| 40 | FieldType indexType(const string& fieldname) const { | ||
| 41 | return None; | ||
| 42 | } | ||
| 43 | }; | ||
| 44 | |||
// Writes the usage line to stderr, using argv[0] as the program name.
void
printUsage(char** argv) {
    const char* const program = argv[0];
    fprintf(stderr, "Usage: %s [dir-or-file-to-find]\n", program);
}
// Returns true if one of the arguments after argv[0] is "--help" or "-h".
bool
containsHelp(int argc, char **argv) {
    int index = 1;
    while (index < argc) {
        const char* const candidate = argv[index];
        ++index;
        if (strcmp(candidate, "--help") == 0) return true;
        if (strcmp(candidate, "-h") == 0) return true;
    }
    return false;
}
| 57 | |||
| 58 | int | ||
| 59 | main(int argc, char **argv) { | ||
| 60 | const char* path = "."; | ||
| 61 | if (containsHelp(argc, argv) || argc > 2) { | ||
| 62 | printUsage(argv); | ||
| 63 | return -1; | ||
| 64 | } | ||
| 65 | if (argc == 2) { | ||
| 66 | path = argv[1]; | ||
| 67 | } | ||
| 68 | |||
| 69 | DummyIndexManager manager(1); | ||
| 70 | FindIndexerConfiguration conf; | ||
| 71 | DirAnalyzer analyzer(manager, conf); | ||
| 72 | analyzer.analyzeDir(path, 1); | ||
| 73 | return 0; | ||
| 74 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include <stdio.h> | ||
| 21 | |||
| 22 | #include <strigi/strigiconfig.h> | ||
| 23 | #include "grepindexmanager.h" | ||
| 24 | #include <strigi/diranalyzer.h> | ||
| 25 | #include <strigi/analyzerconfiguration.h> | ||
| 26 | #include <iostream> | ||
| 27 | #include <cstring> | ||
| 28 | using namespace Strigi; | ||
| 29 | using namespace std; | ||
| 30 | |||
// Writes the usage text with the available options to stderr, using
// argv[0] as the program name.
void
printUsage(char** argv) {
    const char* const program = argv[0];
    fprintf(stderr,
        "Usage: %s [--fields] [--help] PATTERN [dir-or-file-to-grep]\n"
        " --fields print the list of fields\n"
        " --help print this help screen\n",
        program);
}
// Returns true if one of the arguments after argv[0] equals arg or, when
// a is given, equals a.
bool
containsArgument(int argc, char **argv, const char* arg, const char* a=0) {
    int index = 1;
    while (index < argc) {
        const char* const candidate = argv[index];
        ++index;
        if (strcmp(candidate, arg) == 0) return true;
        if (a != 0 && strcmp(candidate, a) == 0) return true;
    }
    return false;
}
| 46 | bool | ||
| 47 | containsHelp(int argc, char **argv) { | ||
| 48 | return containsArgument(argc, argv, "--help", "-h"); | ||
| 49 | } | ||
| 50 | bool | ||
| 51 | containsFieldList(int argc, char **argv) { | ||
| 52 | return containsArgument(argc, argv, "--fields", "-f"); | ||
| 53 | } | ||
| 54 | |||
| 55 | void | ||
| 56 | printFields(AnalyzerConfiguration& conf) { | ||
| 57 | const map<string, RegisteredField*>& fields | ||
| 58 | = conf.fieldRegister().fields(); | ||
| 59 | map<string, RegisteredField*>::const_iterator i; | ||
| 60 | for (i = fields.begin(); i != fields.end(); ++i) { | ||
| 61 | cout << i->first << endl; | ||
| 62 | } | ||
| 63 | } | ||
| 64 | |||
| 65 | int | ||
| 66 | main(int argc, char** argv) { | ||
| 67 | AnalyzerConfiguration ic; | ||
| 68 | if (containsFieldList(argc, argv)) { | ||
| 69 | printFields(ic); | ||
| 70 | return 0; | ||
| 71 | } | ||
| 72 | if (containsHelp(argc, argv) || argc < 2) { | ||
| 73 | printUsage(argv); | ||
| 74 | return -1; | ||
| 75 | } | ||
| 76 | GrepIndexManager manager(argv[1]); | ||
| 77 | |||
| 78 | DirAnalyzer analyzer(manager, ic); | ||
| 79 | int nthreads = 8; | ||
| 80 | if (argc > 2) { | ||
| 81 | for (int32_t i=2; i<argc; ++i) { | ||
| 82 | analyzer.analyzeDir(argv[i], nthreads); | ||
| 83 | } | ||
| 84 | } else { | ||
| 85 | analyzer.analyzeDir(".", nthreads); | ||
| 86 | } | ||
| 87 | return 0; | ||
| 88 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include "dummyindexwriter.h" | ||
| 21 | #include <strigi/strigiconfig.h> | ||
| 22 | #include <strigi/diranalyzer.h> | ||
| 23 | #include <strigi/analyzerconfiguration.h> | ||
| 24 | |||
| 25 | #include <stdlib.h> | ||
| 26 | |||
// Writes the usage line to stderr, using argv[0] as the program name.
void
printUsage(char** argv) {
    const char* const program = argv[0];
    fprintf(stderr, "Usage: %s [-v verbosity] [dir-to-index]\n", program);
}
| 31 | |||
| 32 | int | ||
| 33 | main(int argc, char **argv) { | ||
| 34 | if (argc != 2 && argc != 4) { | ||
| 35 | printUsage(argv); | ||
| 36 | return -1; | ||
| 37 | } | ||
| 38 | int verbosity = 0; | ||
| 39 | if (argc == 4) { | ||
| 40 | if (std::strcmp("-v", argv[1])) { | ||
| 41 | printUsage(argv); | ||
| 42 | return -1; | ||
| 43 | } | ||
| 44 | verbosity = atoi(argv[2]); | ||
| 45 | } | ||
| 46 | |||
| 47 | DummyIndexManager manager(verbosity); | ||
| 48 | Strigi::AnalyzerConfiguration ic; | ||
| 49 | Strigi::DirAnalyzer analyzer(manager, ic); | ||
| 50 | analyzer.analyzeDir(argv[argc-1]); | ||
| 51 | return 0; | ||
| 52 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifndef DUMMYINDEXWRITER_H | ||
| 21 | #define DUMMYINDEXWRITER_H | ||
| 22 | |||
| 23 | #include <strigi/analysisresult.h> | ||
| 24 | #include <strigi/indexwriter.h> | ||
| 25 | #include <strigi/indexmanager.h> | ||
| 26 | #include <strigi/fieldtypes.h> | ||
| 27 | #include <cstring> | ||
| 28 | |||
| 29 | class DummyIndexWriter : public Strigi::IndexWriter { | ||
| 30 | private: | ||
| 31 | int verbosity; | ||
| 32 | protected: | ||
| 33 | void startAnalysis(const Strigi::AnalysisResult* ar) { | ||
| 34 | if (verbosity >= 1) { | ||
| 35 | printf("%s\n", ar->path().c_str()); | ||
| 36 | } | ||
| 37 | if (verbosity == -1) { // sha1 mode | ||
| 38 | std::string* s = new std::string(); | ||
| 39 | ar->setWriterData(s); | ||
| 40 | } | ||
| 41 | } | ||
| 42 | void finishAnalysis(const Strigi::AnalysisResult* ar) { | ||
| 43 | if (verbosity == -1) { // sha1 mode | ||
| 44 | const std::string* s = static_cast<const std::string*>( | ||
| 45 | ar->writerData()); | ||
| 46 | printf("%s\t%s\n", ar->path().c_str(), s->c_str()); | ||
| 47 | delete s; | ||
| 48 | } | ||
| 49 | } | ||
| 50 | void addText(const Strigi::AnalysisResult* ar, const char* text, | ||
| 51 | int32_t length) { | ||
| 52 | if (verbosity > 2) { | ||
| 53 | printf("%s: addText '%.*s'\n", ar->path().c_str(), length, | ||
| 54 | text); | ||
| 55 | } | ||
| 56 | } | ||
| 57 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 58 | const Strigi::RegisteredField* field, const std::string& value) { | ||
| 59 | if (verbosity > 1) { | ||
| 60 | printf("%s: setField '%s': '%s'\n", ar->path().c_str(), | ||
| 61 | field->key().c_str(), value.c_str()); | ||
| 62 | } else if (verbosity == -1 | ||
| 63 | && std::strcmp(field->key().c_str(), "sha1") == 0) { | ||
| 64 | std::string* s = static_cast<std::string*>(ar->writerData()); | ||
| 65 | *s = value; | ||
| 66 | } | ||
| 67 | } | ||
| 68 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 69 | const Strigi::RegisteredField* fieldname, const unsigned char* data, | ||
| 70 | uint32_t size) {} | ||
| 71 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 72 | const Strigi::RegisteredField* fieldname, uint32_t value) {} | ||
| 73 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 74 | const Strigi::RegisteredField* fieldname, int32_t value) {} | ||
| 75 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 76 | const Strigi::RegisteredField* fieldname, double value) {} | ||
| 77 | void addTriplet(const std::string& subject, | ||
| 78 | const std::string& predicate, const std::string& object) {} | ||
| 79 | void addValue(const Strigi::AnalysisResult*, | ||
| 80 | const Strigi::RegisteredField* field, const std::string& name, | ||
| 81 | const std::string& value) {} | ||
| 82 | public: | ||
| 83 | DummyIndexWriter(int v = 0) { | ||
| 84 | verbosity = v; | ||
| 85 | } | ||
| 86 | ~DummyIndexWriter() {} | ||
| 87 | void commit() {} | ||
| 88 | void deleteEntries(const std::vector<std::string>& entries) {} | ||
| 89 | void deleteAllEntries() {} | ||
| 90 | }; | ||
| 91 | |||
| 92 | class DummyIndexManager : public Strigi::IndexManager { | ||
| 93 | private: | ||
| 94 | DummyIndexWriter writer; | ||
| 95 | public: | ||
| 96 | DummyIndexManager(int level) :writer(level) {} | ||
| 97 | Strigi::IndexWriter* indexWriter() { | ||
| 98 | return &writer; | ||
| 99 | } | ||
| 100 | Strigi::IndexReader* indexReader() { | ||
| 101 | return 0; | ||
| 102 | } | ||
| 103 | }; | ||
| 104 | |||
| 105 | #endif |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include "grepindexmanager.h" | ||
| 21 | #include "grepindexreader.h" | ||
| 22 | #include "grepindexwriter.h" | ||
| 23 | #include <strigi/indexwriter.h> | ||
| 24 | using namespace Strigi; | ||
| 25 | |||
| 26 | GrepIndexManager::GrepIndexManager(const char* regex) | ||
| 27 | :reader(0), | ||
| 28 | writer(new GrepIndexWriter(regex)){ | ||
| 29 | } | ||
| 30 | GrepIndexManager::~GrepIndexManager() { | ||
| 31 | delete writer; | ||
| 32 | } | ||
| 33 | Strigi::IndexReader* | ||
| 34 | GrepIndexManager::indexReader() { | ||
| 35 | return reader; | ||
| 36 | } | ||
| 37 | Strigi::IndexWriter* | ||
| 38 | GrepIndexManager::indexWriter() { | ||
| 39 | return writer; | ||
| 40 | } | ||
| 41 | Strigi::IndexManager* | ||
| 42 | createGrepIndexManager(const char* path) { | ||
| 43 | return new GrepIndexManager(path); | ||
| 44 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifndef GREPINDEXMANAGER_H | ||
| 21 | #define GREPINDEXMANAGER_H | ||
| 22 | |||
| 23 | #include <strigi/strigiconfig.h> | ||
| 24 | #include <strigi/indexmanager.h> | ||
| 25 | |||
| 26 | class GrepIndexManager : public Strigi::IndexManager { | ||
| 27 | private: | ||
| 28 | Strigi::IndexReader* const reader; | ||
| 29 | Strigi::IndexWriter* const writer; | ||
| 30 | public: | ||
| 31 | explicit GrepIndexManager(const char* regex); | ||
| 32 | ~GrepIndexManager(); | ||
| 33 | |||
| 34 | Strigi::IndexReader* indexReader(); | ||
| 35 | Strigi::IndexWriter* indexWriter(); | ||
| 36 | }; | ||
| 37 | |||
| 38 | Strigi::IndexManager* | ||
| 39 | createGrepIndexManager(const char* path); | ||
| 40 | |||
| 41 | #endif |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include "grepindexreader.h" | ||
| 21 | #include <strigi/analyzerconfiguration.h> | ||
| 22 | #include <strigi/query.h> | ||
| 23 | #include <strigi/variant.h> | ||
| 24 | #include <strigi/indexwriter.h> | ||
| 25 | #include <strigi/filelister.h> | ||
| 26 | #include <set> | ||
| 27 | using namespace std; | ||
| 28 | using namespace Strigi; | ||
| 29 | |||
| 30 | /** | ||
| 31 | * Custom configuration that extracts specific fields. | ||
| 32 | **/ | ||
| 33 | class FieldAnalyzerConfiguration : public AnalyzerConfiguration { | ||
| 34 | private: | ||
| 35 | const set<string> neededFields; | ||
| 36 | mutable set<string> availableFields; | ||
| 37 | signed char m_maxDepth; | ||
| 38 | const bool needsAllFields; | ||
| 39 | public: | ||
| 40 | FieldAnalyzerConfiguration(const set<string>& fields); | ||
| 41 | bool useFactory(StreamAnalyzerFactory*) const; | ||
| 42 | bool useFactory(StreamEndAnalyzerFactory*) const; | ||
| 43 | bool hasAllFields() const; | ||
| 44 | }; | ||
| 45 | FieldAnalyzerConfiguration::FieldAnalyzerConfiguration(const set<string>& f) | ||
| 46 | :neededFields(f), m_maxDepth(-1), needsAllFields(f.find("")!=f.end()) { | ||
| 47 | } | ||
| 48 | bool | ||
| 49 | FieldAnalyzerConfiguration::useFactory(StreamAnalyzerFactory* f) const { | ||
| 50 | bool use = needsAllFields; | ||
| 51 | const vector<const RegisteredField*>& fields = f->registeredFields(); | ||
| 52 | vector<const RegisteredField*>::const_iterator i; | ||
| 53 | for (i = fields.begin(); i != fields.end(); ++i) { | ||
| 54 | const RegisteredField* field = *i; | ||
| 55 | do { | ||
| 56 | if (neededFields.find(field->key()) != neededFields.end()) { | ||
| 57 | availableFields.insert(field->key()); | ||
| 58 | use = true; | ||
| 59 | } | ||
| 60 | field = field->parent(); | ||
| 61 | } while (field); | ||
| 62 | } | ||
| 63 | return use; | ||
| 64 | } | ||
| 65 | bool | ||
| 66 | FieldAnalyzerConfiguration::useFactory(StreamEndAnalyzerFactory* f) const { | ||
| 67 | return f->analyzesSubStreams() || static_cast<StreamAnalyzerFactory*>(f); | ||
| 68 | } | ||
| 69 | |||
| 70 | class QueryIndexWriter : public IndexWriter { | ||
| 71 | public: | ||
| 72 | void startAnalysis(const AnalysisResult*) {} | ||
| 73 | void addText(const AnalysisResult* result, const char* text, int32_t length) {} | ||
| 74 | void addValue(const AnalysisResult* result, const RegisteredField* field, | ||
| 75 | const std::string& value) {} | ||
| 76 | void addValue(const AnalysisResult* result, const RegisteredField* field, | ||
| 77 | const unsigned char* data, uint32_t size) {} | ||
| 78 | void addValue(const AnalysisResult* result, const RegisteredField* field, | ||
| 79 | int32_t value) {} | ||
| 80 | void addValue(const AnalysisResult* result, const RegisteredField* field, | ||
| 81 | uint32_t value) {} | ||
| 82 | void addValue(const AnalysisResult* result, const RegisteredField* field, | ||
| 83 | double value) {} | ||
| 84 | void addValue(const AnalysisResult* result, const RegisteredField* field, | ||
| 85 | const std::string& name, const std::string& value) {} | ||
| 86 | void finishAnalysis(const AnalysisResult* result) {} | ||
| 87 | void addTriplet(const std::string& subject, | ||
| 88 | const std::string& predicate, const std::string& object) {} | ||
| 89 | public: | ||
| 90 | void commit() { return; } | ||
| 91 | void deleteEntries(const std::vector<std::string>& entries) {} | ||
| 92 | void deleteAllEntries() {} | ||
| 93 | /** | ||
| 94 | * @brief Return the number of objects that are currently in the cache. | ||
| 95 | **/ | ||
| 96 | virtual int itemsInCache() { return 0; } | ||
| 97 | void optimize() {} | ||
| 98 | void initWriterData(const Strigi::FieldRegister& fieldRegister) {} | ||
| 99 | void releaseWriterData(const Strigi::FieldRegister& fieldRegister) {} | ||
| 100 | }; | ||
| 101 | |||
| 102 | class GrepIndexReader::Private { | ||
| 103 | public: | ||
| 104 | const string dir; | ||
| 105 | |||
| 106 | Private(const string& d) :dir(d) {} | ||
| 107 | }; | ||
| 108 | |||
| 109 | GrepIndexReader::GrepIndexReader(const string& dir) :p(new Private(dir)) { | ||
| 110 | } | ||
| 111 | GrepIndexReader::~GrepIndexReader() { | ||
| 112 | delete p; | ||
| 113 | } | ||
| 114 | void | ||
| 115 | getFields(set<string>& fields, const Query& query) { | ||
| 116 | copy(query.fields().begin(), query.fields().end(), | ||
| 117 | inserter(fields, fields.begin())); | ||
| 118 | for (vector<Query>::const_iterator i = query.subQueries().begin(); | ||
| 119 | i != query.subQueries().end(); ++i) { | ||
| 120 | getFields(fields, *i); | ||
| 121 | } | ||
| 122 | } | ||
| 123 | int32_t | ||
| 124 | GrepIndexReader::countHits(const Query& query) { | ||
| 125 | QueryIndexWriter qiw; | ||
| 126 | // make an analyzerconfiguration with a limited set of fields | ||
| 127 | set<string> fields; | ||
| 128 | getFields(fields, query); | ||
| 129 | FieldAnalyzerConfiguration conf(fields); | ||
| 130 | StreamAnalyzer analyzer(conf); | ||
| 131 | analyzer.setIndexWriter(qiw); | ||
| 132 | return -1; | ||
| 133 | } | ||
| 134 | vector<IndexedDocument> | ||
| 135 | GrepIndexReader::query(const Query&, int offset, int max) { | ||
| 136 | vector<IndexedDocument> hits; | ||
| 137 | return hits; | ||
| 138 | } | ||
| 139 | void | ||
| 140 | GrepIndexReader::getHits(const Strigi::Query&, | ||
| 141 | const std::vector<std::string>& fields, | ||
| 142 | const std::vector<Strigi::Variant::Type>& types, | ||
| 143 | std::vector<std::vector<Strigi::Variant> >& result, int off, int max) { | ||
| 144 | result.clear(); | ||
| 145 | } | ||
| 146 | map<string, time_t> | ||
| 147 | GrepIndexReader::files(char depth) { | ||
| 148 | map<string, time_t> files; | ||
| 149 | return files; | ||
| 150 | } | ||
| 151 | int32_t | ||
| 152 | GrepIndexReader::countDocuments() { return -1; } | ||
| 153 | int32_t | ||
| 154 | GrepIndexReader::countWords() { return -1; } | ||
| 155 | int64_t | ||
| 156 | GrepIndexReader::indexSize() { | ||
| 157 | // we have no index :-) | ||
| 158 | return 0; | ||
| 159 | } | ||
| 160 | /** | ||
| 161 | * This does not have to be implemented since we have not index. | ||
| 162 | **/ | ||
| 163 | time_t | ||
| 164 | GrepIndexReader::mTime(const std::string& uri) { | ||
| 165 | return -1; | ||
| 166 | } | ||
| 167 | vector<string> | ||
| 168 | GrepIndexReader::fieldNames() { | ||
| 169 | vector<string> fieldnames; | ||
| 170 | return fieldnames; | ||
| 171 | } | ||
| 172 | vector<pair<string,uint32_t> > | ||
| 173 | GrepIndexReader::histogram(const string& query, const string& fieldname, | ||
| 174 | const string& labeltype) { | ||
| 175 | vector<pair<string,uint32_t> > histogram; | ||
| 176 | return histogram; | ||
| 177 | } | ||
| 178 | int32_t | ||
| 179 | GrepIndexReader::countKeywords(const string& keywordprefix, | ||
| 180 | const vector<string>& fieldnames) { | ||
| 181 | return -1; | ||
| 182 | } | ||
| 183 | vector<string> | ||
| 184 | GrepIndexReader::keywords( | ||
| 185 | const string& keywordmatch, | ||
| 186 | const vector<string>& fieldnames, | ||
| 187 | uint32_t max, uint32_t offset) { | ||
| 188 | vector<string> keywords; | ||
| 189 | return keywords; | ||
| 190 | } | ||
| 191 | int | ||
| 192 | main() { | ||
| 193 | GrepIndexReader("/home"); | ||
| 194 | return 0; | ||
| 195 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifndef GREPINDEXREADER_H | ||
| 21 | #define GREPINDEXREADER_H | ||
| 22 | |||
| 23 | #include <strigi/strigiconfig.h> | ||
| 24 | #include <strigi/indexreader.h> | ||
| 25 | |||
| 26 | #include <map> | ||
| 27 | #include <time.h> | ||
| 28 | |||
| 29 | class GrepIndexReader : public Strigi::IndexReader { | ||
| 30 | private: | ||
| 31 | class Private; | ||
| 32 | Private* const p; | ||
| 33 | public: | ||
| 34 | GrepIndexReader(const std::string& dir); | ||
| 35 | ~GrepIndexReader(); | ||
| 36 | int32_t countHits(const Strigi::Query& query); | ||
| 37 | std::vector<Strigi::IndexedDocument> query(const Strigi::Query&, int offset, | ||
| 38 | int max); | ||
| 39 | void getHits(const Strigi::Query&, const std::vector<std::string>& fields, | ||
| 40 | const std::vector<Strigi::Variant::Type>& types, | ||
| 41 | std::vector<std::vector<Strigi::Variant> >& result, int off, int max); | ||
| 42 | std::map<std::string, time_t> files(char depth); | ||
| 43 | int32_t countDocuments(); | ||
| 44 | int32_t countWords(); | ||
| 45 | int64_t indexSize(); | ||
| 46 | time_t mTime(const std::string& uri); | ||
| 47 | std::vector<std::string> fieldNames(); | ||
| 48 | std::vector<std::pair<std::string,uint32_t> > histogram( | ||
| 49 | const std::string& query, const std::string& fieldname, | ||
| 50 | const std::string& labeltype); | ||
| 51 | int32_t countKeywords(const std::string& keywordprefix, | ||
| 52 | const std::vector<std::string>& fieldnames); | ||
| 53 | std::vector<std::string> keywords( | ||
| 54 | const std::string& keywordmatch, | ||
| 55 | const std::vector<std::string>& fieldnames, | ||
| 56 | uint32_t max, uint32_t offset); | ||
| 57 | }; | ||
| 58 | |||
| 59 | #endif |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include "grepindexwriter.h" | ||
| 21 | #include <strigi/analysisresult.h> | ||
| 22 | #include <strigi/fieldtypes.h> | ||
| 23 | #include <regex.h> | ||
| 24 | using namespace std; | ||
| 25 | |||
| 26 | class GrepIndexWriter::Private | ||
| 27 | { | ||
| 28 | public: | ||
| 29 | Private() {} | ||
| 30 | regex_t regex; | ||
| 31 | }; | ||
| 32 | |||
| 33 | GrepIndexWriter::GrepIndexWriter(const char* re) | ||
| 34 | : d(new Private()) { | ||
| 35 | regcomp(&d->regex, re, REG_NOSUB); | ||
| 36 | } | ||
| 37 | GrepIndexWriter::~GrepIndexWriter() { | ||
| 38 | regfree(&d->regex); | ||
| 39 | delete d; | ||
| 40 | } | ||
| 41 | void | ||
| 42 | GrepIndexWriter::startAnalysis(const Strigi::AnalysisResult* idx) { | ||
| 43 | } | ||
| 44 | void | ||
| 45 | GrepIndexWriter::finishAnalysis(const Strigi::AnalysisResult* idx) { | ||
| 46 | } | ||
| 47 | void | ||
| 48 | GrepIndexWriter::addText(const Strigi::AnalysisResult* idx, const char* text, | ||
| 49 | int32_t length) { | ||
| 50 | // unfortunately we have to copy the incoming stream because regexec() | ||
| 51 | // assumes a null-terminated string and we are not allowed to modify the | ||
| 52 | // incoming message | ||
| 53 | string s; | ||
| 54 | const char* start = text; | ||
| 55 | const char* end = text+length; | ||
| 56 | const char* p = start; | ||
| 57 | while (p < end) { | ||
| 58 | // look at each line separately | ||
| 59 | if (*p == '\n' || *p == '\r') { | ||
| 60 | s.assign(start, p-start); | ||
| 61 | if (regexec(&d->regex, s.c_str(), 0, 0, 0) == 0) { | ||
| 62 | printf("%s:%s\n", idx->path().c_str(), s.c_str()); | ||
| 63 | } | ||
| 64 | start = p+1; | ||
| 65 | } | ||
| 66 | p++; | ||
| 67 | } | ||
| 68 | s.assign(start, p-start); | ||
| 69 | if (regexec(&d->regex, s.c_str(), 0, 0, 0) == 0) { | ||
| 70 | printf("%s:%s\n", idx->path().c_str(), s.c_str()); | ||
| 71 | } | ||
| 72 | } | ||
| 73 | void | ||
| 74 | GrepIndexWriter::addValue(const Strigi::AnalysisResult* idx, | ||
| 75 | const Strigi::RegisteredField* field, const std::string& value) { | ||
| 76 | if (regexec(&d->regex, value.c_str(), 0, 0, 0) == 0) { | ||
| 77 | printf("%s:%s:%s\n", idx->path().c_str(), | ||
| 78 | field->key().c_str(), value.c_str()); | ||
| 79 | } | ||
| 80 | } | ||
| 81 | void | ||
| 82 | GrepIndexWriter::addValue(const Strigi::AnalysisResult* idx, | ||
| 83 | const Strigi::RegisteredField* field, | ||
| 84 | const unsigned char* data, uint32_t size) { | ||
| 85 | if (!field->properties().binary()) { | ||
| 86 | string value((const char*)data, size); | ||
| 87 | addValue(idx, field, value); | ||
| 88 | } | ||
| 89 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifndef GREPINDEXWRITER_H | ||
| 21 | #define GREPINDEXWRITER_H | ||
| 22 | |||
| 23 | #include <strigi/indexwriter.h> | ||
| 24 | |||
| 25 | class GrepIndexWriter : public Strigi::IndexWriter { | ||
| 26 | private: | ||
| 27 | class Private; | ||
| 28 | Private * const d; | ||
| 29 | protected: | ||
| 30 | void startAnalysis(const Strigi::AnalysisResult* idx); | ||
| 31 | void finishAnalysis(const Strigi::AnalysisResult* idx); | ||
| 32 | void addText(const Strigi::AnalysisResult* idx, const char* text, | ||
| 33 | int32_t length); | ||
| 34 | void addValue(const Strigi::AnalysisResult* idx, | ||
| 35 | const Strigi::RegisteredField* field, const std::string& value); | ||
| 36 | void addValue(const Strigi::AnalysisResult* idx, | ||
| 37 | const Strigi::RegisteredField* field, | ||
| 38 | const unsigned char* data, uint32_t size); | ||
| 39 | void addValue(const Strigi::AnalysisResult* idx, | ||
| 40 | const Strigi::RegisteredField* field, uint32_t value) {} | ||
| 41 | void addValue(const Strigi::AnalysisResult* idx, | ||
| 42 | const Strigi::RegisteredField* field, int32_t value) {} | ||
| 43 | void addValue(const Strigi::AnalysisResult* idx, | ||
| 44 | const Strigi::RegisteredField* field, double value) {} | ||
| 45 | void addTriplet(const std::string& subject, | ||
| 46 | const std::string& predicate, const std::string& object) {} | ||
| 47 | void addValue(const Strigi::AnalysisResult*, | ||
| 48 | const Strigi::RegisteredField* field, const std::string& name, | ||
| 49 | const std::string& value) {} | ||
| 50 | public: | ||
| 51 | explicit GrepIndexWriter(const char* re); | ||
| 52 | ~GrepIndexWriter(); | ||
| 53 | void commit() {} | ||
| 54 | void deleteEntries(const std::vector<std::string>& entries) {} | ||
| 55 | void deleteAllEntries() {} | ||
| 56 | }; | ||
| 57 | |||
| 58 | #endif |
|   | |||
# Plot the data in the file 'graph' as a PNG image written to 'x.png'.

# output device and file
set terminal png
set output 'x.png'

# axes: logarithmic x axis, labelled axes, no legend
set logscale x
set xlabel 'exit time (s)'
set ylabel 'chance that exit time is larger'
unset key

# line style 1: line type 1, line width 6
set style line 1 lt 1 lw 6

plot 'graph'
|   | |||
# Header search paths for the tools in this directory.
include_directories(
    ../streamanalyzer
    ../streams
    ../streams/strigi
    ${strigi_BINARY_DIR}/src/streams
    ${BZIP2_INCLUDE_DIR}
)

# rdfindexer (installed)
add_executable(rdfindexer
    rdfindexer.cpp
    tagmapping.cpp
    rdfindexwriter.cpp
)

target_link_libraries(rdfindexer streamanalyzer)

install(TARGETS rdfindexer RUNTIME DESTINATION bin)

# xmlindexer (installed further below)
add_executable(xmlindexer
    xmlindexer.cpp
    tagmapping.cpp
    xmlindexwriter.cpp
)

target_link_libraries(xmlindexer streamanalyzer)

# cgixmlindexer (built but not installed)
add_executable(cgixmlindexer
    tagmapping.cpp
    xmlindexwriter.cpp
    cgixmlindexer.cpp)

target_link_libraries(cgixmlindexer streamanalyzer)

install(TARGETS xmlindexer RUNTIME DESTINATION bin)

#add_executable(peranalyzerxml peranalyzerxml.cpp xmlindexwriter.cpp
#    tagmapping.cpp)
#target_link_libraries(peranalyzerxml streamanalyzer)

# perfieldxml (built but not installed)
add_executable(perfieldxml perfieldxml.cpp xmlindexwriter.cpp
    tagmapping.cpp)
target_link_libraries(perfieldxml streamanalyzer)

# is this still broken on win32?
if(NOT WIN32)
    add_executable(ontoprint ontoprint.cpp)
    target_link_libraries(ontoprint streamanalyzer)
endif(NOT WIN32)

# register all tests based on the data in the testdata directory
# NOTE: the glob that fills 'allfiles' is commented out, so nothing in this
# file sets 'allfiles' and the loop below registers no tests here. The tests
# also run 'peranalyzerxml', whose target is commented out above.
#FILE(GLOB_RECURSE allfiles ../../testdata/analyzers/*/config)
FOREACH(file ${allfiles})
    GET_FILENAME_COMPONENT(testdir ${file} PATH)
    GET_FILENAME_COMPONENT(dir ${testdir} NAME)
    FILE(GLOB_RECURSE ofiles "${testdir}/*")
    FOREACH(ofile ${ofiles})
        STRING(REPLACE "/analyzers/${dir}/" "/data/" ifile ${ofile})
        IF(NOT ${ifile} MATCHES "/.svn/" AND NOT ${ifile} MATCHES "config$")
            # the test name is the path of the file relative to its test
            # directory (an earlier REPLACE of "*/" was removed: its result
            # was overwritten by this statement before being used)
            STRING(REPLACE ${testdir} "" testname ${ofile})
            ADD_TEST("${dir}${testname}" peranalyzerxml -c ${testdir}/config
                -r ${ofile} ${ifile})
        ENDIF(NOT ${ifile} MATCHES "/.svn/" AND NOT ${ifile} MATCHES "config$")
    ENDFOREACH(ofile ${ofiles})
ENDFOREACH(file ${allfiles})
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include "xmlindexwriter.h" | ||
| 22 | #include <strigi/stringstream.h> | ||
| 23 | #include <strigi/stringterminatedsubstream.h> | ||
| 24 | #include <strigi/subinputstream.h> | ||
| 25 | #include <time.h> | ||
| 26 | #include <iostream> | ||
| 27 | #include <cstdlib> | ||
| 28 | #include <stdlib.h> // getenv | ||
| 29 | using namespace Strigi; | ||
| 30 | using namespace std; | ||
| 31 | |||
| 32 | string | ||
| 33 | readHeader(InputStream* f) { | ||
| 34 | StringTerminatedSubStream header(f, "\r\n\r\n"); | ||
| 35 | string h; | ||
| 36 | const char* d; | ||
| 37 | int32_t nread = header.read(d, 1000, 0); | ||
| 38 | while (nread > 0) { | ||
| 39 | h.append(d, nread); | ||
| 40 | nread = header.read(d, 1000, 0); | ||
| 41 | } | ||
| 42 | return h; | ||
| 43 | } | ||
| 44 | |||
| 45 | /** | ||
| 46 | * Start parsing a file. The stream must be positioned at the start of the file | ||
| 47 | * header. | ||
| 48 | **/ | ||
| 49 | bool | ||
| 50 | parseFile(StreamAnalyzer& sa, XmlIndexManager& manager, | ||
| 51 | InputStream* f, const string& delim) { | ||
| 52 | |||
| 53 | string header = readHeader(f); | ||
| 54 | string filename; | ||
| 55 | const char* start = header.c_str(); | ||
| 56 | start = strstr(start, "filename="); | ||
| 57 | if (start) { | ||
| 58 | start += 9; | ||
| 59 | const char* end = 0; | ||
| 60 | char c = *start; | ||
| 61 | if (c == '\'' || c == '"') { | ||
| 62 | start += 1; | ||
| 63 | end = strchr(start, c); | ||
| 64 | } | ||
| 65 | if (end) { | ||
| 66 | filename.assign(start, end-start); | ||
| 67 | } else { | ||
| 68 | filename.assign(start); | ||
| 69 | } | ||
| 70 | } | ||
| 71 | |||
| 72 | // analyzer the stream | ||
| 73 | StringTerminatedSubStream stream(f, delim); | ||
| 74 | if (filename.size()) { | ||
| 75 | AnalysisResult result(filename, time(0), *manager.indexWriter(), sa); | ||
| 76 | sa.analyze(result, &stream); | ||
| 77 | } | ||
| 78 | // read the rest of the stream | ||
| 79 | const char* d; | ||
| 80 | int32_t nread = stream.read(d, 1000, 0); | ||
| 81 | while (nread > 0) { | ||
| 82 | nread = stream.read(d, 1000, 0); | ||
| 83 | } | ||
| 84 | |||
| 85 | // check if this is the last file | ||
| 86 | nread = f->read(d, 2, 2); | ||
| 87 | return nread == 2 && *d == '\r' && d[1] == '\n'; | ||
| 88 | } | ||
| 89 | |||
| 90 | int | ||
| 91 | main() { | ||
| 92 | const TagMapping mapping(0); | ||
| 93 | cout << "Content-Type:text/xml;charset=UTF-8\r\n\r\n" | ||
| 94 | "<?xml version='1.0' encoding='UTF-8'?>\n<" | ||
| 95 | << mapping.map("metadata") << ">\n"; | ||
| 96 | |||
| 97 | int len; | ||
| 98 | const char* lenstr = getenv("CONTENT_LENGTH"); | ||
| 99 | if (lenstr == NULL || sscanf(lenstr,"%id", &len) != 1 || len < 0) { | ||
| 100 | cout << " <error>input too small</error>\n</" | ||
| 101 | << mapping.map("metadata") << ">\n" << flush; | ||
| 102 | return 0; | ||
| 103 | } | ||
| 104 | cerr << "len " << len << endl; | ||
| 105 | char* e = new char[len]; | ||
| 106 | if (e == 0 || fread(e, 1, len, stdin) != (size_t)len) { | ||
| 107 | cout << " <error>cannot allocate memory</error>\n</" | ||
| 108 | << mapping.map("metadata") << ">\n" << flush; | ||
| 109 | return 0; | ||
| 110 | } | ||
| 111 | |||
| 112 | // read from stdin | ||
| 113 | StringInputStream stream(e, len); | ||
| 114 | |||
| 115 | // read the first line | ||
| 116 | const char* d = NULL; | ||
| 117 | const int32_t maxlength = 1024; | ||
| 118 | int32_t nread = stream.read(d, maxlength, maxlength); | ||
| 119 | stream.reset(0); | ||
| 120 | |||
| 121 | if (nread < 1) { | ||
| 122 | cout << " <error>input too small</error>\n</" | ||
| 123 | << mapping.map("metadata") << ">\n" << flush; | ||
| 124 | return 0; | ||
| 125 | } | ||
| 126 | |||
| 127 | // get out the delimiter | ||
| 128 | const char* end = d + nread; | ||
| 129 | const char* p = d; | ||
| 130 | while (p < end-1 && *p != '\r') p++; | ||
| 131 | if (*p != '\r' || p[1] != '\n') { | ||
| 132 | cout << " <error>no delimiter line</error></" | ||
| 133 | << mapping.map("metadata") << ">\n" << flush; | ||
| 134 | return 0; | ||
| 135 | } | ||
| 136 | string delim("\r\n"); | ||
| 137 | delim.append(d, p-d); | ||
| 138 | |||
| 139 | // skip the delimiter + '\r\n' | ||
| 140 | stream.reset(delim.length() + 2); | ||
| 141 | |||
| 142 | // parse all files | ||
| 143 | XmlIndexManager manager(cout, mapping); | ||
| 144 | AnalyzerConfiguration ac; | ||
| 145 | StreamAnalyzer sa(ac); | ||
| 146 | sa.setIndexWriter(*manager.indexWriter()); | ||
| 147 | while (parseFile(sa, manager, &stream, delim)) {}; | ||
| 148 | cout << "</" << mapping.map("metadata") << ">\n" << flush; | ||
| 149 | |||
| 150 | return 0; | ||
| 151 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include <iostream> | ||
| 21 | #include <cstdlib> | ||
| 22 | #include <cstring> | ||
| 23 | #include <string> | ||
| 24 | #include <list> | ||
| 25 | #include <algorithm> | ||
| 26 | #include <unistd.h> | ||
| 27 | #include <getopt.h> | ||
| 28 | #include <strigi/fieldpropertiesdb.h> | ||
| 29 | #include <strigi/streamanalyzer.h> | ||
| 30 | #include <strigi/analyzerconfiguration.h> | ||
| 31 | using namespace std; | ||
| 32 | using namespace Strigi; | ||
| 33 | |||
| 34 | void | ||
| 35 | printDot(ostream& out, const char* locale) { | ||
| 36 | const map<string, FieldProperties>& p | ||
| 37 | = FieldPropertiesDb::db().allProperties(); | ||
| 38 | map<string, FieldProperties>::const_iterator i; | ||
| 39 | list<string> categories; | ||
| 40 | out << "digraph{graph[rankdir=LR];" << endl; | ||
| 41 | for (i = p.begin(); i != p.end(); ++i) { | ||
| 42 | const vector<string>& parents = i->second.parentUris(); | ||
| 43 | vector<string>::const_iterator j; | ||
| 44 | out << '"' << i->second.uri() << "\" [shape=record, label =\"" << i->second.uri() << "|{type: " << i->second.typeUri() << "}\"];" << endl; | ||
| 45 | for (j = parents.begin(); j != parents.end(); ++j) { | ||
| 46 | out << '"' << *j << "\"->\"" << i->second.uri() << "\";" << endl; | ||
| 47 | } | ||
| 48 | // make link to category, e.g. chemistry for chemistry.inchi | ||
| 49 | string category = i->second.uri().substr(0,i->second.uri().find(".")); | ||
| 50 | if (category.length() != i->second.uri().length()) { | ||
| 51 | list<string>::const_iterator match = find(categories.begin(), categories.end(), category); | ||
| 52 | if (match == categories.end()) { | ||
| 53 | categories.push_back(category); | ||
| 54 | out << "\"" << category << "\" [style=filled,color=gray];" << endl; | ||
| 55 | } | ||
| 56 | out << "\"" << category << "\"->\"" << i->second.uri() << "\";" << endl; | ||
| 57 | } | ||
| 58 | } | ||
| 59 | out << "}" << endl; | ||
| 60 | } | ||
| 61 | void | ||
| 62 | printRdfsProperties(ostream& out, const FieldProperties& p) { | ||
| 63 | out << " <rdf:Property rdf:about='" << p.uri() << "'>\n" | ||
| 64 | << " <rdfs:label>" << p.name() << "</rdfs:label>\n" | ||
| 65 | << " <rdfs:comment>" << p.description() << "</rdfs:comment>\n"; | ||
| 66 | const vector<string>& parents = p.parentUris(); | ||
| 67 | vector<string>::const_iterator j; | ||
| 68 | for (j = parents.begin(); j != parents.end(); ++j){ | ||
| 69 | out << " <rdfs:subPropertyOf rdf:resource='" << *j << "'/>\n"; | ||
| 70 | } | ||
| 71 | const vector<string>& classes = p.applicableClasses(); | ||
| 72 | for (j = classes.begin(); j != classes.end(); ++j){ | ||
| 73 | out << " <rdfs:domain rdf:resource='" << *j << "'/>\n"; | ||
| 74 | } | ||
| 75 | |||
| 76 | out << " <rdfs:range rdf:resource='" << p.typeUri() << "'/>\n"; | ||
| 77 | const vector<string>& locales = p.locales(); | ||
| 78 | for (j = locales.begin(); j != locales.end(); ++j) { | ||
| 79 | const string& name = p.localizedName(*j); | ||
| 80 | if (name.size()) { | ||
| 81 | out << " <rdfs:label xml:lang='" << *j << "'>" << name << "</rdfs:label>\n"; | ||
| 82 | } | ||
| 83 | const string& description = p.localizedDescription(*j); | ||
| 84 | if (description.size()) { | ||
| 85 | out << " <rdfs:comment xml:lang='" << *j << "'>" << description << "</rdfs:comment>\n"; | ||
| 86 | } | ||
| 87 | } | ||
| 88 | out << " </rdf:Property>\n"; | ||
| 89 | } | ||
| 90 | |||
| 91 | void | ||
| 92 | printRdfsClasses(ostream& out, const ClassProperties& p) { | ||
| 93 | out << " <rdfs:Class rdf:about='" << p.uri() << "'>\n" | ||
| 94 | << " <rdfs:label>" << p.name() << "</rdfs:label>\n" | ||
| 95 | << " <rdfs:comment>" << p.description() << "</rdfs:comment>\n"; | ||
| 96 | const vector<string>& parents = p.parentUris(); | ||
| 97 | vector<string>::const_iterator j; | ||
| 98 | for (j = parents.begin(); j != parents.end(); ++j){ | ||
| 99 | out << " <rdfs:subClassOf rdf:resource='" << *j << "'/>\n"; | ||
| 100 | } | ||
| 101 | |||
| 102 | const vector<string>& locales = p.locales(); | ||
| 103 | for (j = locales.begin(); j != locales.end(); ++j) { | ||
| 104 | const string& name = p.localizedName(*j); | ||
| 105 | if (name.size()) { | ||
| 106 | out << " <rdfs:label xml:lang='" << *j << "'>" << name << "</rdfs:label>\n"; | ||
| 107 | } | ||
| 108 | const string& description = p.localizedDescription(*j); | ||
| 109 | if (description.size()) { | ||
| 110 | out << " <rdfs:comment xml:lang='" << *j << "'>" << description << "</rdfs:comment>\n"; | ||
| 111 | } | ||
| 112 | } | ||
| 113 | out << " </rdfs:Class>\n"; | ||
| 114 | } | ||
| 115 | |||
| 116 | void | ||
| 117 | printRdfs(ostream& out) { | ||
| 118 | out << "<?xml version='1.0' encoding='UTF-8'?>\n" | ||
| 119 | "<!DOCTYPE rdf:RDF [\n" | ||
| 120 | " <!ENTITY rdf 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'>\n" | ||
| 121 | " <!ENTITY strigi 'http://strigi URL goes here#'>\n" | ||
| 122 | " <!ENTITY rdfs 'http://www.w3.org/2000/01/rdf-schema#'>\n" | ||
| 123 | "]>\n" | ||
| 124 | "<rdf:RDF " | ||
| 125 | "xmlns:rdf='&rdf;' xmlns:strigi='&strigi;' xmlns:rdfs='&rdfs;'>\n"; | ||
| 126 | |||
| 127 | const map<string, FieldProperties>& p | ||
| 128 | = FieldPropertiesDb::db().allProperties(); | ||
| 129 | map<string, FieldProperties>::const_iterator i; | ||
| 130 | for (i = p.begin(); i != p.end(); ++i) { | ||
| 131 | printRdfsProperties(out, i->second); | ||
| 132 | } | ||
| 133 | |||
| 134 | const map<string, ClassProperties>& c | ||
| 135 | = FieldPropertiesDb::db().allClasses(); | ||
| 136 | map<string, ClassProperties>::const_iterator j; | ||
| 137 | for (j = c.begin(); j != c.end(); ++j) { | ||
| 138 | printRdfsClasses(out, j->second); | ||
| 139 | } | ||
| 140 | |||
| 141 | out << "</rdf:RDF>" << endl; | ||
| 142 | } | ||
| 143 | void | ||
| 144 | printHelp(const char* program) { | ||
| 145 | cerr << "Usage: " << program << " [--type=<type>] [--locale=<locale>]" | ||
| 146 | << endl; | ||
| 147 | } | ||
| 148 | int | ||
| 149 | main(int argc, char** argv) { | ||
| 150 | struct option long_options[] = { | ||
| 151 | {"help", no_argument, 0, 0}, | ||
| 152 | {"type", required_argument, 0, 0}, | ||
| 153 | {"locale", required_argument, 0, 0} | ||
| 154 | }; | ||
| 155 | const char* type = 0; | ||
| 156 | const char* locale = 0; | ||
| 157 | bool help = false; | ||
| 158 | while (1) { | ||
| 159 | int optindex; | ||
| 160 | int c = getopt_long(argc, argv, "", long_options, &optindex); | ||
| 161 | if (c == -1) break; | ||
| 162 | if (c == 0) { | ||
| 163 | if (optindex == 0) help = true; | ||
| 164 | if (optindex == 1) type = optarg; | ||
| 165 | if (optindex == 2) locale = optarg; | ||
| 166 | } | ||
| 167 | switch (c) { | ||
| 168 | case '?': | ||
| 169 | printHelp(argv[0]); | ||
| 170 | exit(1); | ||
| 171 | default: | ||
| 172 | break; | ||
| 173 | } | ||
| 174 | } | ||
| 175 | |||
| 176 | // load the plugins | ||
| 177 | AnalyzerConfiguration ac; | ||
| 178 | StreamAnalyzer s(ac); | ||
| 179 | |||
| 180 | if (help) { | ||
| 181 | printHelp(argv[0]); | ||
| 182 | } else if (type && strcmp(type, "dot") == 0) { | ||
| 183 | printDot(cout, locale); | ||
| 184 | } else { | ||
| 185 | printRdfs(cout); | ||
| 186 | } | ||
| 187 | return 0; | ||
| 188 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifdef HAVE_CONFIG_H | ||
| 21 | #include <config.h> | ||
| 22 | #endif | ||
| 23 | |||
| 24 | #include <strigi/strigiconfig.h> | ||
| 25 | //#include "compat.h" | ||
| 26 | #include <strigi/fileinputstream.h> | ||
| 27 | #include <strigi/bz2inputstream.h> | ||
| 28 | #include <strigi/diranalyzer.h> | ||
| 29 | #include <strigi/analyzerconfiguration.h> | ||
| 30 | #include <strigi/streamendanalyzer.h> | ||
| 31 | #include <strigi/streamthroughanalyzer.h> | ||
| 32 | #include <strigi/streamlineanalyzer.h> | ||
| 33 | #include <strigi/streamsaxanalyzer.h> | ||
| 34 | #include <strigi/streameventanalyzer.h> | ||
| 35 | #include "xmlindexwriter.h" | ||
| 36 | |||
| 37 | #include <cstdio> | ||
| 38 | #include <cstring> | ||
| 39 | #include <cerrno> | ||
| 40 | #include <algorithm> | ||
| 41 | #ifdef HAVE_UNISTD_H | ||
| 42 | #include <unistd.h> | ||
| 43 | #endif | ||
| 44 | #include <stdlib.h> | ||
| 45 | #ifdef HAVE_DIRECT_H | ||
| 46 | #include <direct.h> | ||
| 47 | #endif | ||
| 48 | #include <iostream> | ||
| 49 | #include <sstream> | ||
| 50 | #include <fstream> | ||
| 51 | #include <set> | ||
| 52 | using namespace Strigi; | ||
| 53 | using namespace std; | ||
| 54 | |||
| 55 | class SelectedAnalyzerConfiguration : public Strigi::AnalyzerConfiguration { | ||
| 56 | public: | ||
| 57 | const set<string> requiredAnalyzers; | ||
| 58 | set<string> obligatoryAnalyzers; | ||
| 59 | mutable set<string> usedAnalyzers; | ||
| 60 | mutable set<string> availableAnalyzers; | ||
| 61 | |||
| 62 | explicit SelectedAnalyzerConfiguration(const set<string> an) | ||
| 63 | : requiredAnalyzers(an) { | ||
| 64 | obligatoryAnalyzers.insert("EventThroughAnalyzer"); | ||
| 65 | } | ||
| 66 | |||
| 67 | bool valid() const { | ||
| 68 | return requiredAnalyzers.size() + 1 == usedAnalyzers.size() | ||
| 69 | || requiredAnalyzers.size() == 0; | ||
| 70 | } | ||
| 71 | bool useFactory(const string& name) const { | ||
| 72 | bool use = requiredAnalyzers.find(name) != requiredAnalyzers.end() | ||
| 73 | || obligatoryAnalyzers.find(name) != obligatoryAnalyzers.end() | ||
| 74 | || requiredAnalyzers.size() == 0; | ||
| 75 | if (use) { | ||
| 76 | usedAnalyzers.insert(name); | ||
| 77 | } | ||
| 78 | availableAnalyzers.insert(name); | ||
| 79 | return use; | ||
| 80 | } | ||
| 81 | bool useFactory(StreamEndAnalyzerFactory* f) const { | ||
| 82 | return useFactory(f->name()); | ||
| 83 | } | ||
| 84 | bool useFactory(StreamThroughAnalyzerFactory* f) const { | ||
| 85 | return useFactory(f->name()); | ||
| 86 | } | ||
| 87 | bool useFactory(StreamSaxAnalyzerFactory* f) const { | ||
| 88 | return useFactory(f->name()); | ||
| 89 | } | ||
| 90 | bool useFactory(StreamEventAnalyzerFactory* f) const { | ||
| 91 | return useFactory(f->name()); | ||
| 92 | } | ||
| 93 | bool useFactory(StreamLineAnalyzerFactory* f) const { | ||
| 94 | return useFactory(f->name()); | ||
| 95 | } | ||
| 96 | }; | ||
| 97 | |||
/**
 * Write the command line synopsis and the option overview to stderr.
 * argv[0] is used as the program name.
 **/
void
printUsage(char** argv) {
    static const char* const optionOverview =
        "Analyze the given file and output the result as XML.\n"
        " -c configuration file\n"
        " -a comma-separated list of analyzers\n"
        " -r reference output, when specified, the reference output is \n"
        " compared to the given output and the first difference is \n"
        " reported.\n";
    fprintf(stderr, "Usage: %s [OPTIONS] SOURCE\n", argv[0]);
    fputs(optionOverview, stderr);
}
/**
 * Return true when any argument after the program name is "-h" or "--help".
 **/
bool
containsHelp(int argc, char **argv) {
    int index = 1;
    while (index < argc) {
        const char* const argument = argv[index];
        if (!strcmp(argument, "-h") || !strcmp(argument, "--help")) {
            return true;
        }
        ++index;
    }
    return false;
}
/**
 * Split a comma-separated list into the set of its items.
 * Empty items (also a leading, trailing or sole empty item) are kept.
 **/
std::set<std::string>
parseAnalyzerNames(const char* names) {
    const std::string list(names);
    std::set<std::string> result;
    std::string::size_type begin = 0;
    for (;;) {
        const std::string::size_type comma = list.find(',', begin);
        if (comma == std::string::npos) {
            // the remainder after the last comma is the final item
            result.insert(list.substr(begin));
            break;
        }
        result.insert(list.substr(begin, comma - begin));
        begin = comma + 1;
    }
    return result;
}
/**
 * Read analyzer names from a configuration file.
 *
 * Every line that starts with "analyzer=" contributes the rest of the line
 * as one analyzer name; all other lines are ignored. A trailing carriage
 * return is removed so that files with CRLF line endings give the same names
 * as files with LF line endings.
 *
 * @param config path of the configuration file
 * @return the set of names found; empty when the file cannot be read
 **/
std::set<std::string>
parseConfig(const char* config) {
    static const char prefix[] = "analyzer=";
    const std::string::size_type prefixLength = sizeof(prefix) - 1;

    std::set<std::string> names;
    std::ifstream file(config);
    std::string line;
    // looping on getline() itself stops exactly when no further line was read
    while (std::getline(file, line)) {
        if (!line.empty() && line[line.size() - 1] == '\r') {
            line.erase(line.size() - 1);
        }
        if (line.compare(0, prefixLength, prefix) == 0) {
            names.insert(line.substr(prefixLength));
        }
    }
    return names;
}
| 144 | /** | ||
| 145 | * Usage: $0 [OPTIONS] SOURCE | ||
| 146 | **/ | ||
| 147 | int | ||
| 148 | main(int argc, char** argv) { | ||
| 149 | setenv("XDG_DATA_HOME", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1); | ||
| 150 | setenv("XDG_DATA_DIRS", SOURCEDIR"/src/streamanalyzer/fieldproperties", 1); | ||
| 151 | setenv("STRIGI_PLUGIN_PATH", BINARYDIR"/src/streamanalyzer/throughplugins" | ||
| 152 | PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/lineplugins" | ||
| 153 | PATH_SEPARATOR BINARYDIR"/src/streamanalyzer/saxplugins", 1); | ||
| 154 | // there are 2 optional options that both require an argument. | ||
| 155 | // one can specify 1 source, so the number of arguments must be | ||
| 156 | // 2, 4 or 6 | ||
| 157 | if (containsHelp(argc, argv) || (argc != 2 && argc != 4 && argc != 6)) { | ||
| 158 | printUsage(argv); | ||
| 159 | return -1; | ||
| 160 | } | ||
| 161 | |||
| 162 | set<string> analyzers; | ||
| 163 | const char* targetFile; | ||
| 164 | const char* referenceFile = 0; | ||
| 165 | if (argc == 4) { | ||
| 166 | if (strcmp(argv[1],"-a") == 0) { | ||
| 167 | analyzers = parseAnalyzerNames(argv[2]); | ||
| 168 | } else if (strcmp(argv[1], "-r") == 0) { | ||
| 169 | referenceFile = argv[2]; | ||
| 170 | } else if (strcmp(argv[1], "-c") == 0) { | ||
| 171 | analyzers = parseConfig(argv[2]); | ||
| 172 | } else { | ||
| 173 | printUsage(argv); | ||
| 174 | return -1; | ||
| 175 | } | ||
| 176 | targetFile = argv[3]; | ||
| 177 | } else if (argc == 6) { | ||
| 178 | if (strcmp(argv[1], "-a") == 0) { | ||
| 179 | analyzers = parseAnalyzerNames(argv[2]); | ||
| 180 | if (strcmp(argv[3], "-r") == 0) { | ||
| 181 | referenceFile = argv[4]; | ||
| 182 | } | ||
| 183 | } else if (strcmp(argv[1], "-c") == 0) { | ||
| 184 | analyzers = parseConfig(argv[2]); | ||
| 185 | if (strcmp(argv[3], "-r") == 0) { | ||
| 186 | referenceFile = argv[4]; | ||
| 187 | } | ||
| 188 | } else if (strcmp(argv[1], "-r") == 0) { | ||
| 189 | referenceFile = argv[2]; | ||
| 190 | if (strcmp(argv[3], "-a") == 0) { | ||
| 191 | analyzers = parseAnalyzerNames(argv[4]); | ||
| 192 | } else if (strcmp(argv[3], "-c") == 0) { | ||
| 193 | analyzers = parseConfig(argv[4]); | ||
| 194 | } | ||
| 195 | } else { | ||
| 196 | printUsage(argv); | ||
| 197 | return -1; | ||
| 198 | } | ||
| 199 | targetFile = argv[5]; | ||
| 200 | } else { | ||
| 201 | targetFile = argv[1]; | ||
| 202 | } | ||
| 203 | |||
| 204 | const char* mappingFile = 0; | ||
| 205 | |||
| 206 | // check that the target file exists | ||
| 207 | { | ||
| 208 | ifstream filetest(targetFile); | ||
| 209 | if (!filetest.good()) { | ||
| 210 | cerr << "The file '" << targetFile << "' cannot be read." << endl; | ||
| 211 | return 1; | ||
| 212 | } | ||
| 213 | } | ||
| 214 | // check that the result file is ok | ||
| 215 | FileInputStream f(referenceFile); | ||
| 216 | if (referenceFile != 0 && f.status() != Ok) { | ||
| 217 | cerr << "The file '" << referenceFile << "' cannot be read." << endl; | ||
| 218 | return 1; | ||
| 219 | } | ||
| 220 | |||
| 221 | const TagMapping mapping(mappingFile); | ||
| 222 | ostringstream out; | ||
| 223 | out << "<?xml version='1.0' encoding='UTF-8'?>\n<" | ||
| 224 | << mapping.map("metadata"); | ||
| 225 | map<string, string>::const_iterator i = mapping.namespaces().begin(); | ||
| 226 | while (i != mapping.namespaces().end()) { | ||
| 227 | out << " xmlns:" << i->first << "='" << i->second << "'"; | ||
| 228 | i++; | ||
| 229 | } | ||
| 230 | out << ">\n"; | ||
| 231 | |||
| 232 | SelectedAnalyzerConfiguration ic(analyzers); | ||
| 233 | |||
| 234 | XmlIndexManager manager(out, mapping); | ||
| 235 | DirAnalyzer analyzer(manager, ic); | ||
| 236 | if (!ic.valid()) { | ||
| 237 | set<string>::const_iterator i; | ||
| 238 | set<string> missing; | ||
| 239 | set_difference(analyzers.begin(), analyzers.end(), | ||
| 240 | ic.availableAnalyzers.begin(), ic.availableAnalyzers.end(), | ||
| 241 | insert_iterator<set<string> >(missing, missing.begin())); | ||
| 242 | if (missing.size() == 1) { | ||
| 243 | fprintf(stderr, "No analyzer with name %s was found.\n", | ||
| 244 | missing.begin()->c_str()); | ||
| 245 | } else { | ||
| 246 | cerr << "The analyzers"; | ||
| 247 | for (i = missing.begin(); i != missing.end(); ++i) { | ||
| 248 | cerr << ", " << *i; | ||
| 249 | } | ||
| 250 | cerr << " were not found." << endl; | ||
| 251 | } | ||
| 252 | fprintf(stderr, "Choose from:\n"); | ||
| 253 | for (i = ic.availableAnalyzers.begin(); | ||
| 254 | i != ic.availableAnalyzers.end(); ++i) { | ||
| 255 | cerr << " " << *i << endl; | ||
| 256 | } | ||
| 257 | return 1; | ||
| 258 | } | ||
| 259 | |||
| 260 | // change to the directory of the file to analyze | ||
| 261 | // this ensures a consistent naming of the file uris, regardless of cwd | ||
| 262 | string targetPath(targetFile); | ||
| 263 | string::size_type slashpos = targetPath.rfind('/'); | ||
| 264 | if (slashpos == string::npos) { | ||
| 265 | analyzer.analyzeDir(targetFile); | ||
| 266 | } else { | ||
| 267 | if (chdir(targetPath.substr(0,slashpos).c_str()) == -1) { | ||
| 268 | fprintf(stderr, "%s\n", strerror(errno)); | ||
| 269 | return -1; | ||
| 270 | } | ||
| 271 | analyzer.analyzeDir(targetPath.substr(slashpos+1).c_str()); | ||
| 272 | } | ||
| 273 | string str = out.str(); | ||
| 274 | int32_t n = 2*(int32_t)str.length(); | ||
| 275 | |||
| 276 | // if no reference file was specified, we output the analysis | ||
| 277 | if (referenceFile == 0) { | ||
| 278 | cout << str; | ||
| 279 | return 0; | ||
| 280 | } | ||
| 281 | |||
| 282 | // load the file to compare with | ||
| 283 | const char* c; | ||
| 284 | n = f.read(c, n, n); | ||
| 285 | if (n < 0) { | ||
| 286 | fprintf(stderr, "Error: %s\n", f.error()); | ||
| 287 | return -1; | ||
| 288 | } | ||
| 289 | if (n != (int32_t)out.str().length()) { | ||
| 290 | cout << "output length differs " << out.str().length() << " instead of " | ||
| 291 | << n << endl; | ||
| 292 | return -1; | ||
| 293 | } | ||
| 294 | |||
| 295 | const char* p1 = c; | ||
| 296 | const char* p2 = str.c_str(); | ||
| 297 | int32_t n1 = n; | ||
| 298 | string::size_type n2 = str.length(); | ||
| 299 | while (n1-- && n2-- && *p1 == *p2) { | ||
| 300 | p1++; | ||
| 301 | p2++; | ||
| 302 | } | ||
| 303 | if (n1 ==0 && (*p1 || *p2)) { | ||
| 304 | cout << "difference at position " << p1-c << endl; | ||
| 305 | |||
| 306 | int32_t m = (80 > str.length())?(int32_t)str.length():80; | ||
| 307 | printf("%i %.*s\n", m, m, str.c_str()); | ||
| 308 | |||
| 309 | m = (80 > n)?n:80; | ||
| 310 | printf("%i %.*s\n", m, m, c); | ||
| 311 | |||
| 312 | return -1; | ||
| 313 | } | ||
| 314 | |||
| 315 | return 0; | ||
| 316 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006,2008 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifdef HAVE_CONFIG_H | ||
| 21 | #include <config.h> | ||
| 22 | #endif | ||
| 23 | |||
| 24 | #include <strigi/strigiconfig.h> | ||
| 25 | #include <strigi/fileinputstream.h> | ||
| 26 | #include <strigi/bz2inputstream.h> | ||
| 27 | #include <strigi/diranalyzer.h> | ||
| 28 | #include <strigi/analyzerconfiguration.h> | ||
| 29 | #include "xmlindexwriter.h" | ||
| 30 | #include <strigi/streamendanalyzer.h> | ||
| 31 | #include <strigi/streamthroughanalyzer.h> | ||
| 32 | #include <strigi/streamlineanalyzer.h> | ||
| 33 | #include <strigi/streamsaxanalyzer.h> | ||
| 34 | #include <strigi/streameventanalyzer.h> | ||
| 35 | |||
| 36 | #include <cstdio> | ||
| 37 | #include <cstring> | ||
| 38 | #include <cerrno> | ||
| 39 | #include <algorithm> | ||
| 40 | #ifdef HAVE_UNISTD_H | ||
| 41 | #include <unistd.h> | ||
| 42 | #endif | ||
| 43 | #ifdef HAVE_DIRECT_H | ||
| 44 | #include <direct.h> | ||
| 45 | #endif | ||
| 46 | |||
| 47 | #include <sstream> | ||
| 48 | #include <iostream> | ||
| 49 | #include <fstream> | ||
| 50 | #include <set> | ||
| 51 | using namespace Strigi; | ||
| 52 | using namespace std; | ||
| 53 | |||
| 54 | /** | ||
| 55 | * Configure analysis bases on what fields we want to extract. | ||
| 56 | **/ | ||
| 57 | class SelectedFieldConfiguration : public Strigi::AnalyzerConfiguration { | ||
| 58 | public: | ||
| 59 | /** | ||
| 60 | * Fields the user has requested to be reported. | ||
| 61 | **/ | ||
| 62 | const set<string> requiredFields; | ||
| 63 | /** | ||
| 64 | * Fields that were requested by the user and are provided by some analyzer. | ||
| 65 | **/ | ||
| 66 | mutable set<string> usedFields; | ||
| 67 | /** | ||
| 68 | * All fields provided by all analyzers. | ||
| 69 | **/ | ||
| 70 | mutable set<string> availableFields; | ||
| 71 | |||
| 72 | explicit SelectedFieldConfiguration(const set<string> af) | ||
| 73 | : requiredFields(af) {} | ||
| 74 | |||
| 75 | /** | ||
| 76 | * The configuration is valid if all fields requested can be supplied by | ||
| 77 | * the active set of analyzer. | ||
| 78 | **/ | ||
| 79 | bool valid() const { | ||
| 80 | return requiredFields.size() == usedFields.size(); | ||
| 81 | } | ||
| 82 | /** | ||
| 83 | * If a certain field should be reported, return Stored, otherwise return | ||
| 84 | * None. | ||
| 85 | **/ | ||
| 86 | FieldType indexType(const Strigi::RegisteredField* f) const { | ||
| 87 | return (requiredFields.find(f->key()) != requiredFields.end()) | ||
| 88 | ? Stored :None; | ||
| 89 | } | ||
| 90 | /** | ||
| 91 | * If any of the fields provided by the given analyzer are requested, use | ||
| 92 | * that analyzer for the analysis. | ||
| 93 | **/ | ||
| 94 | bool useAnalyzerFactory(const StreamAnalyzerFactory* f) const { | ||
| 95 | bool use = false; | ||
| 96 | vector<const RegisteredField*>::const_iterator i; | ||
| 97 | i = f->registeredFields().begin(); | ||
| 98 | const vector<const RegisteredField*>::const_iterator end = | ||
| 99 | f->registeredFields().end(); | ||
| 100 | for (; i != end; ++i) { | ||
| 101 | string key((*i)->key()); | ||
| 102 | availableFields.insert(key); | ||
| 103 | bool usethis = requiredFields.find(key) != requiredFields.end(); | ||
| 104 | if (usethis) { | ||
| 105 | use = true; | ||
| 106 | usedFields.insert((*i)->key()); | ||
| 107 | } | ||
| 108 | } | ||
| 109 | return use; | ||
| 110 | } | ||
| 111 | bool useFactory(StreamEndAnalyzerFactory* f) const { | ||
| 112 | return useAnalyzerFactory(f); | ||
| 113 | } | ||
| 114 | bool useFactory(StreamThroughAnalyzerFactory* f) const { | ||
| 115 | return useAnalyzerFactory(f); | ||
| 116 | } | ||
| 117 | bool useFactory(StreamSaxAnalyzerFactory* f) const { | ||
| 118 | return useAnalyzerFactory(f); | ||
| 119 | } | ||
| 120 | bool useFactory(StreamEventAnalyzerFactory* f) const { | ||
| 121 | return useAnalyzerFactory(f); | ||
| 122 | } | ||
| 123 | bool useFactory(StreamLineAnalyzerFactory* f) const { | ||
| 124 | return useAnalyzerFactory(f); | ||
| 125 | } | ||
| 126 | }; | ||
| 127 | |||
/**
 * Write the command line synopsis to stderr, using argv[0] as program name.
 *
 * The text lists the options that main() parses: -f with a comma-separated
 * field list and -r with a reference file, followed by the source to analyze.
 **/
void
printUsage(char** argv) {
    fprintf(stderr, "Usage: %s [-f FIELDS] [-r REFERENCE] SOURCE\n"
        " -f comma-separated list of fields to report\n"
        " -r bzip2-compressed reference output to compare the result with\n",
        argv[0]);
}
/**
 * Tell whether one of the arguments following the program name asks for
 * help, i.e. equals "--help" or "-h".
 **/
bool
containsHelp(int argc, char **argv) {
    bool helpRequested = false;
    for (int pos = 1; !helpRequested && pos < argc; ++pos) {
        helpRequested = strcmp(argv[pos], "--help") == 0
            || strcmp(argv[pos], "-h") == 0;
    }
    return helpRequested;
}
/**
 * Turn a comma-separated list of field names into a set.
 * Every comma ends an item, so empty items are part of the result.
 **/
std::set<std::string>
parseFieldNames(const char* names) {
    std::set<std::string> fields;
    std::string current;
    for (const char* p = names; *p != '\0'; ++p) {
        if (*p == ',') {
            fields.insert(current);
            current.clear();
        } else {
            current += *p;
        }
    }
    // whatever follows the last comma (possibly nothing) is the last item
    fields.insert(current);
    return fields;
}
| 154 | /** | ||
| 155 | * Usage: $0 [OPTIONS] SOURCE | ||
| 156 | **/ | ||
| 157 | int | ||
| 158 | main(int argc, char** argv) { | ||
| 159 | // there are 2 optional options that both require an argument. | ||
| 160 | // one can specify 1 source, so the number of arguments must be | ||
| 161 | // 2, 4 or 6 | ||
| 162 | if (containsHelp(argc, argv) || (argc != 2 && argc != 4 && argc != 6)) { | ||
| 163 | printUsage(argv); | ||
| 164 | return -1; | ||
| 165 | } | ||
| 166 | |||
| 167 | set<string> analyzers; | ||
| 168 | const char* targetFile; | ||
| 169 | const char* referenceFile = 0; | ||
| 170 | if (argc == 4) { | ||
| 171 | if (strcmp(argv[1],"-f") == 0) { | ||
| 172 | analyzers = parseFieldNames(argv[2]); | ||
| 173 | } else if (strcmp(argv[1], "-r") == 0) { | ||
| 174 | referenceFile = argv[2]; | ||
| 175 | } else { | ||
| 176 | printUsage(argv); | ||
| 177 | return -1; | ||
| 178 | } | ||
| 179 | targetFile = argv[3]; | ||
| 180 | } else if (argc == 6) { | ||
| 181 | if (strcmp(argv[1], "-f") == 0) { | ||
| 182 | analyzers = parseFieldNames(argv[2]); | ||
| 183 | if (strcmp(argv[3], "-r") == 0) { | ||
| 184 | referenceFile = argv[4]; | ||
| 185 | } | ||
| 186 | } else if (strcmp(argv[1], "-r") == 0) { | ||
| 187 | referenceFile = argv[2]; | ||
| 188 | if (strcmp(argv[3], "-f") == 0) { | ||
| 189 | analyzers = parseFieldNames(argv[4]); | ||
| 190 | } | ||
| 191 | } else { | ||
| 192 | printUsage(argv); | ||
| 193 | return -1; | ||
| 194 | } | ||
| 195 | targetFile = argv[5]; | ||
| 196 | } else { | ||
| 197 | targetFile = argv[1]; | ||
| 198 | } | ||
| 199 | |||
| 200 | const char* mappingFile = 0; | ||
| 201 | |||
| 202 | // check that the target file exists | ||
| 203 | { | ||
| 204 | ifstream filetest(targetFile); | ||
| 205 | if (!filetest.good()) { | ||
| 206 | cerr << "The file '" << targetFile << "' cannot be read." << endl; | ||
| 207 | return 1; | ||
| 208 | } | ||
| 209 | } | ||
| 210 | |||
| 211 | const TagMapping mapping(mappingFile); | ||
| 212 | ostringstream out; | ||
| 213 | out << "<?xml version='1.0' encoding='UTF-8'?>\n<" | ||
| 214 | << mapping.map("metadata"); | ||
| 215 | map<string, string>::const_iterator i = mapping.namespaces().begin(); | ||
| 216 | while (i != mapping.namespaces().end()) { | ||
| 217 | out << " xmlns:" << i->first << "='" << i->second << "'"; | ||
| 218 | i++; | ||
| 219 | } | ||
| 220 | out << ">\n"; | ||
| 221 | |||
| 222 | ostringstream s; | ||
| 223 | SelectedFieldConfiguration ic(analyzers); | ||
| 224 | XmlIndexManager manager(s, mapping); | ||
| 225 | DirAnalyzer analyzer(manager, ic); | ||
| 226 | if (!ic.valid()) { | ||
| 227 | set<string>::const_iterator i; | ||
| 228 | set<string> missing; | ||
| 229 | set_difference(analyzers.begin(), analyzers.end(), | ||
| 230 | ic.availableFields.begin(), ic.availableFields.end(), | ||
| 231 | insert_iterator<set<string> >(missing, missing.begin())); | ||
| 232 | if (missing.size() == 1) { | ||
| 233 | fprintf(stderr, "No field with name %s was found.\n", | ||
| 234 | missing.begin()->c_str()); | ||
| 235 | } else { | ||
| 236 | cerr << "The fields "; | ||
| 237 | for (i = missing.begin(); i != missing.end(); ++i) { | ||
| 238 | cerr << ", " << *i; | ||
| 239 | } | ||
| 240 | cerr << " were not found." << endl; | ||
| 241 | } | ||
| 242 | fprintf(stderr, "Choose from:\n"); | ||
| 243 | for (i = ic.availableFields.begin(); i != ic.availableFields.end(); ++i) { | ||
| 244 | cerr << " " << *i << endl; | ||
| 245 | } | ||
| 246 | return 1; | ||
| 247 | } | ||
| 248 | if (chdir(argv[1]) == -1) { | ||
| 249 | fprintf(stderr, "%s\n", strerror(errno)); | ||
| 250 | return -1; | ||
| 251 | } | ||
| 252 | analyzer.analyzeDir(targetFile); | ||
| 253 | string str = s.str(); | ||
| 254 | int32_t n = 2*(int32_t)str.length(); | ||
| 255 | |||
| 256 | // if no reference file was specified, we output the analysis | ||
| 257 | if (referenceFile == 0) { | ||
| 258 | cout << str; | ||
| 259 | return 0; | ||
| 260 | } | ||
| 261 | |||
| 262 | // load the file to compare with | ||
| 263 | FileInputStream f(referenceFile); | ||
| 264 | BZ2InputStream bz2(&f); | ||
| 265 | const char* c; | ||
| 266 | n = bz2.read(c, n, n); | ||
| 267 | if (n < 0) { | ||
| 268 | fprintf(stderr, "Error: %s\n", bz2.error()); | ||
| 269 | return -1; | ||
| 270 | } | ||
| 271 | if (n != (int32_t)s.str().length()) { | ||
| 272 | cerr << "output length differs " << n << " instead of " | ||
| 273 | << s.str().length() << endl; | ||
| 274 | } | ||
| 275 | |||
| 276 | const char* p1 = c; | ||
| 277 | const char* p2 = str.c_str(); | ||
| 278 | int32_t n1 = n; | ||
| 279 | string::size_type n2 = str.length(); | ||
| 280 | while (n1-- && n2-- && *p1 == *p2) { | ||
| 281 | p1++; | ||
| 282 | p2++; | ||
| 283 | } | ||
| 284 | if (n1 ==0 && (*p1 || *p2)) { | ||
| 285 | cerr << "difference at position " << p1-c << endl; | ||
| 286 | |||
| 287 | int32_t m = (80 > str.length())?(int32_t)str.length():80; | ||
| 288 | printf("%i %.*s\n", m, m, str.c_str()); | ||
| 289 | |||
| 290 | m = (80 > n)?n:80; | ||
| 291 | printf("%i %.*s\n", m, m, c); | ||
| 292 | |||
| 293 | return -1; | ||
| 294 | } | ||
| 295 | |||
| 296 | return 0; | ||
| 297 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifdef HAVE_CONFIG_H | ||
| 21 | #include <config.h> | ||
| 22 | #endif | ||
| 23 | |||
| 24 | #include <strigi/strigiconfig.h> | ||
| 25 | #include "rdfindexwriter.h" | ||
| 26 | #include <strigi/analyzerconfiguration.h> | ||
| 27 | #include <strigi/diranalyzer.h> | ||
| 28 | #include <strigi/fileinputstream.h> | ||
| 29 | #include <iostream> | ||
| 30 | #include <cstring> | ||
| 31 | #ifdef HAVE_UNISTD_H | ||
| 32 | #include <unistd.h> | ||
| 33 | #endif | ||
| 34 | #ifdef HAVE_DIRECT_H | ||
| 35 | #include <direct.h> | ||
| 36 | #endif | ||
| 37 | #include <stdlib.h> | ||
| 38 | #include <time.h> | ||
| 39 | |||
| 40 | using namespace std; | ||
| 41 | using namespace Strigi; | ||
| 42 | |||
/**
 * Write the command line synopsis to stderr, using argv[0] as program name.
 * Always returns -1 so that callers can write "return usage(argc, argv);".
 **/
int
usage(int /*argc*/, char** argv) {
    static const char* const optionLines[] = {
        " [--mappingfile <mappingfile>]\n",
        " [--lastfiletoskip FILE]\n",
        " [--stdinmtime mtime]\n",
        " [--stdinfilename filename]\n",
        " [dirs-or-files-to-index]\n",
        " [-j nthreads]\n"
    };
    const unsigned lineCount = sizeof(optionLines) / sizeof(optionLines[0]);

    fprintf(stderr, "Usage: %s\n", argv[0]);
    for (unsigned line = 0; line < lineCount; ++line) {
        fprintf(stderr, "%s", optionLines[line]);
    }
    return -1;
}
/**
 * Scan the arguments after the program name and report whether one of them
 * is exactly "--help" or "-h".
 **/
bool
containsHelp(int argc, char **argv) {
    for (char** argument = argv + 1; argument < argv + argc; ++argument) {
        const bool isLongForm = strcmp(*argument, "--help") == 0;
        const bool isShortForm = !isLongForm && strcmp(*argument, "-h") == 0;
        if (isLongForm || isShortForm) {
            return true;
        }
    }
    return false;
}
| 61 | void | ||
| 62 | analyzeFromStdin(RdfIndexManager& manager, AnalyzerConfiguration& ac, | ||
| 63 | const string& filename, time_t mtime) { | ||
| 64 | StreamAnalyzer sa(ac); | ||
| 65 | sa.setIndexWriter(*manager.indexWriter()); | ||
| 66 | FileInputStream in(stdin, filename.c_str()); | ||
| 67 | AnalysisResult result(filename, mtime, *manager.indexWriter(), sa); | ||
| 68 | sa.analyze(result, &in); | ||
| 69 | } | ||
| 70 | |||
| 71 | int | ||
| 72 | main(int argc, char **argv) { | ||
| 73 | vector<string> dirs; | ||
| 74 | int nthreads = 2; | ||
| 75 | const char* mappingfile = 0; | ||
| 76 | string lastFileToSkip; | ||
| 77 | time_t stdinMTime = time(0); | ||
| 78 | string stdinFilename = "-"; | ||
| 79 | int i = 0; | ||
| 80 | while (++i < argc) { | ||
| 81 | const char* arg = argv[i]; | ||
| 82 | if (!strcmp("-h", arg) || !strcmp("--help", arg)) { | ||
| 83 | return usage(argc, argv); | ||
| 84 | } | ||
| 85 | if (!strcmp("-j", arg)) { | ||
| 86 | if (++i == argc) { | ||
| 87 | return usage(argc, argv); | ||
| 88 | } | ||
| 89 | char* end; | ||
| 90 | nthreads = (int)strtol(argv[i], &end, 10); | ||
| 91 | if (end == argv[i] || nthreads < 1) { | ||
| 92 | return usage(argc, argv); | ||
| 93 | } | ||
| 94 | } else if (!strcmp("--mappingfile", arg)) { | ||
| 95 | if (++i == argc) { | ||
| 96 | return usage(argc, argv); | ||
| 97 | } | ||
| 98 | mappingfile = argv[i]; | ||
| 99 | } else if (!strcmp("--lastfiletoskip", arg)) { | ||
| 100 | if (++i == argc) { | ||
| 101 | return usage(argc, argv); | ||
| 102 | } | ||
| 103 | lastFileToSkip = argv[i]; | ||
| 104 | } else if (!strcmp("--stdinmtime", arg)) { | ||
| 105 | if (++i == argc) { | ||
| 106 | return usage(argc, argv); | ||
| 107 | } | ||
| 108 | char* end; | ||
| 109 | stdinMTime = strtol(argv[i], &end, 10); | ||
| 110 | if (end == argv[i] || stdinMTime < 1) { | ||
| 111 | return usage(argc, argv); | ||
| 112 | } | ||
| 113 | } else if (!strcmp("--stdinfilename", arg)) { | ||
| 114 | if (++i == argc) { | ||
| 115 | return usage(argc, argv); | ||
| 116 | } | ||
| 117 | stdinFilename = argv[i]; | ||
| 118 | } else { | ||
| 119 | const char* dir = argv[i]; | ||
| 120 | // remove trailing '/' | ||
| 121 | size_t len = strlen(dir); | ||
| 122 | if (dir[len-1] == '/') { | ||
| 123 | dirs.push_back(std::string(dir, len-1)); | ||
| 124 | } else { | ||
| 125 | dirs.push_back(dir); | ||
| 126 | } | ||
| 127 | } | ||
| 128 | } | ||
| 129 | |||
| 130 | if (dirs.size() == 0) { | ||
| 131 | char buf[1024]; | ||
| 132 | if (getcwd(buf, 1023) == NULL) { | ||
| 133 | return -1; | ||
| 134 | } | ||
| 135 | dirs.push_back(buf); | ||
| 136 | } | ||
| 137 | |||
| 138 | vector<pair<bool,string> >filters; | ||
| 139 | filters.push_back(make_pair<bool,string>(false,".*/")); | ||
| 140 | filters.push_back(make_pair<bool,string>(false,".*")); | ||
| 141 | AnalyzerConfiguration ic; | ||
| 142 | ic.setFilters(filters); | ||
| 143 | |||
| 144 | const TagMapping mapping(mappingfile); | ||
| 145 | /* cout << "<?xml version='1.0' encoding='UTF-8'?>\n<" | ||
| 146 | << mapping.map("metadata"); | ||
| 147 | map<string, string>::const_iterator k = mapping.namespaces().begin(); | ||
| 148 | while (k != mapping.namespaces().end()) { | ||
| 149 | cout << " xmlns:" << k->first << "='" << k->second << "'"; | ||
| 150 | k++; | ||
| 151 | } | ||
| 152 | cout << ">\n"; | ||
| 153 | */ | ||
| 154 | rdfset rdf; | ||
| 155 | |||
| 156 | RdfIndexManager manager(cout, mapping, rdf); | ||
| 157 | DirAnalyzer analyzer(manager, ic); | ||
| 158 | for (unsigned i = 0; i < dirs.size(); ++i) { | ||
| 159 | if (dirs[i] == "-") { | ||
| 160 | analyzeFromStdin(manager, ic, stdinFilename, stdinMTime); | ||
| 161 | } else { | ||
| 162 | analyzer.analyzeDir(dirs[i], nthreads, 0, lastFileToSkip); | ||
| 163 | } | ||
| 164 | } | ||
| 165 | // cout << "</" << mapping.map("metadata") << ">\n"; | ||
| 166 | |||
| 167 | |||
| 168 | for(rdfset::const_iterator subj = rdf.begin(); subj != rdf.end(); subj++) { | ||
| 169 | cout<< "<" << subj->first << ">"; | ||
| 170 | |||
| 171 | std::map<std::string, std::list<std::string> >::const_iterator pred = subj->second.begin(); | ||
| 172 | do { | ||
| 173 | cout << "\n\t<" << pred->first << "> "; | ||
| 174 | |||
| 175 | std::list<std::string>::const_iterator obj = pred->second.begin(); | ||
| 176 | do { | ||
| 177 | cout << "\n\t\t\"" << *obj << "\""; | ||
| 178 | obj++; | ||
| 179 | if(obj != pred->second.end()) | ||
| 180 | cout <<","; | ||
| 181 | } while (obj != pred->second.end()); | ||
| 182 | pred++; | ||
| 183 | if(pred!=subj->second.end()) | ||
| 184 | cout << ";"; | ||
| 185 | } while(pred!=subj->second.end()); | ||
| 186 | cout<< ".\n"; | ||
| 187 | } | ||
| 188 | |||
| 189 | |||
| 190 | return 0; | ||
| 191 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include "rdfindexwriter.h" | ||
| 21 | using namespace std; | ||
| 22 | using namespace Strigi; | ||
| 23 | |||
| 24 | void | ||
| 25 | RdfIndexWriter::initWriterData(const FieldRegister& f) { | ||
| 26 | map<string, RegisteredField*>::const_iterator i; | ||
| 27 | map<string, RegisteredField*>::const_iterator end(f.fields().end()); | ||
| 28 | for (i = f.fields().begin(); i != end; ++i) { | ||
| 29 | Tag* tag = static_cast<Tag*>(i->second->writerData()); | ||
| 30 | if (tag) { | ||
| 31 | tag->refcount++; | ||
| 32 | continue; | ||
| 33 | } | ||
| 34 | tag = new Tag(); | ||
| 35 | tag->refcount = 1; | ||
| 36 | const string s(i->first); | ||
| 37 | const string& n = mapping.map(s); | ||
| 38 | if (s == n) { | ||
| 39 | tag->open = " <value name='" + n + "'>"; | ||
| 40 | tag->close = "</value>\n"; | ||
| 41 | } else { | ||
| 42 | tag->open = " <" + n + '>'; | ||
| 43 | tag->close = "</" + n + ">\n"; | ||
| 44 | } | ||
| 45 | i->second->setWriterData(tag); | ||
| 46 | } | ||
| 47 | } | ||
| 48 | void | ||
| 49 | RdfIndexWriter::releaseWriterData(const FieldRegister& f) { | ||
| 50 | map<string, RegisteredField*>::const_iterator i; | ||
| 51 | map<string, RegisteredField*>::const_iterator end(f.fields().end()); | ||
| 52 | for (i = f.fields().begin(); i != end; ++i) { | ||
| 53 | Tag* tag = static_cast<Tag*>(i->second->writerData()); | ||
| 54 | if (tag->refcount-- == 1) { | ||
| 55 | //fprintf(stderr, "free for %s\n", i->second->key().c_str()); | ||
| 56 | delete tag; | ||
| 57 | i->second->setWriterData(0); | ||
| 58 | } | ||
| 59 | } | ||
| 60 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifndef RDFINDEXWRITER_H | ||
| 21 | #define RDFINDEXWRITER_H | ||
| 22 | |||
| 23 | #include <strigi/indexwriter.h> | ||
| 24 | #include <strigi/indexmanager.h> | ||
| 25 | #include <strigi/analysisresult.h> | ||
| 26 | #include "tagmapping.h" | ||
| 27 | #include <strigi/fieldtypes.h> | ||
| 28 | #include <strigi/analyzerconfiguration.h> | ||
| 29 | #include <strigi/strigi_thread.h> | ||
| 30 | #include <iostream> | ||
| 31 | #include <sstream> | ||
| 32 | #include <map> | ||
| 33 | #include <list> | ||
| 34 | |||
// Triple store used by RdfIndexWriter::addTriplet:
// subject -> predicate -> list of object values.
typedef std::map<std::string, std::map<std::string, std::list<std::string> > > rdfset;
| 36 | |||
| 37 | class RdfIndexWriter : public Strigi::IndexWriter { | ||
| 38 | private: | ||
| 39 | struct Data { | ||
| 40 | std::multimap<const Strigi::RegisteredField*, std::string> values; | ||
| 41 | std::string text; | ||
| 42 | }; | ||
| 43 | std::map<STRIGI_THREAD_TYPE, std::vector<Data*> > data; | ||
| 44 | struct Tag { | ||
| 45 | std::string open; | ||
| 46 | std::string close; | ||
| 47 | int refcount; | ||
| 48 | }; | ||
| 49 | |||
| 50 | STRIGI_MUTEX_DEFINE(mutex); | ||
| 51 | std::ostream& out; | ||
| 52 | |||
| 53 | rdfset& rdf; | ||
| 54 | |||
| 55 | const TagMapping& mapping; | ||
| 56 | |||
| 57 | void printText(const std::string& text) { | ||
| 58 | const char* p = text.c_str(); | ||
| 59 | const char* end = p + text.size(); | ||
| 60 | char nb = 0; | ||
| 61 | bool lastwhite = true; | ||
| 62 | while (p < end) { | ||
| 63 | char c = *p; | ||
| 64 | if (nb) { | ||
| 65 | if ((0xC0 & c) != 0x80) { | ||
| 66 | return; | ||
| 67 | } | ||
| 68 | out.put(c); | ||
| 69 | nb--; | ||
| 70 | } else if ((0xE0 & c) == 0xC0) { | ||
| 71 | nb = 1; | ||
| 72 | out.put(c); | ||
| 73 | } else if ((0xF0 & c) == 0xE0) { | ||
| 74 | nb = 2; | ||
| 75 | out.put(c); | ||
| 76 | } else if ((0xF8 & c) == 0xF0) { | ||
| 77 | nb = 3; | ||
| 78 | out.put(c); | ||
| 79 | } else if (c <= 8) { | ||
| 80 | return; | ||
| 81 | } else if (c == '&') { | ||
| 82 | out << "&"; | ||
| 83 | } else if (c == '<') { | ||
| 84 | out << "<"; | ||
| 85 | } else if (c == '>') { | ||
| 86 | out << ">"; | ||
| 87 | } else if (isspace(c) != 0) { | ||
| 88 | // we've to handle dos formatting | ||
| 89 | //'\r' char is ignored, it isn't wroten to out and doesn't | ||
| 90 | //change lastwhite value (so the following '\n' will be handled) | ||
| 91 | if (!lastwhite && (c!= '\r')) { | ||
| 92 | out.put(c); | ||
| 93 | lastwhite = true; | ||
| 94 | } | ||
| 95 | } else { | ||
| 96 | lastwhite = false; | ||
| 97 | out.put(c); | ||
| 98 | } | ||
| 99 | p++; | ||
| 100 | } | ||
| 101 | } | ||
| 102 | static void escape(std::string& value) { | ||
| 103 | int namp, nlt, ngt, napos, nexcept; | ||
| 104 | namp = nlt = ngt = napos = nexcept = 0; | ||
| 105 | const char* p = value.c_str(); | ||
| 106 | const char* end = p + value.size(); | ||
| 107 | char nb = 0; | ||
| 108 | while (p < end) { | ||
| 109 | char c = *p; | ||
| 110 | if (nb) { | ||
| 111 | if ((0xC0 & c) != 0x80) { | ||
| 112 | value = ""; | ||
| 113 | return; | ||
| 114 | } | ||
| 115 | nb--; | ||
| 116 | } else if ((0xE0 & c) == 0xC0) { | ||
| 117 | nb = 1; | ||
| 118 | } else if ((0xF0 & c) == 0xE0) { | ||
| 119 | nb = 2; | ||
| 120 | } else if ((0xF8 & c) == 0xF0) { | ||
| 121 | nb = 3; | ||
| 122 | } else if (c < 32 && c != 9 && c != 10 && c != 12) { | ||
| 123 | nexcept++; | ||
| 124 | } else if (c == '&') { | ||
| 125 | namp++; | ||
| 126 | } else if (c == '<') { | ||
| 127 | nlt++; | ||
| 128 | } else if (c == '>') { | ||
| 129 | ngt++; | ||
| 130 | } else if (c == '\'') { | ||
| 131 | napos++; | ||
| 132 | } | ||
| 133 | p++; | ||
| 134 | } | ||
| 135 | // if no character has to be escaped, just return | ||
| 136 | if (!(namp||nlt||ngt|napos|nexcept)) { | ||
| 137 | return; | ||
| 138 | } | ||
| 139 | |||
| 140 | std::string ov(value); | ||
| 141 | p = ov.c_str(); | ||
| 142 | end = p + ov.size(); | ||
| 143 | int newsize = (int)value.size()+4*namp+3*(nlt+ngt)+5*napos+3*nexcept; | ||
| 144 | value.clear(); | ||
| 145 | value.reserve(newsize); | ||
| 146 | while (p < end) { | ||
| 147 | char c = *p; | ||
| 148 | if (nb) { | ||
| 149 | if ((0xC0 & c) != 0x80) { | ||
| 150 | value = ""; | ||
| 151 | return; | ||
| 152 | } | ||
| 153 | nb--; | ||
| 154 | value += c; | ||
| 155 | } else if ((0xE0 & c) == 0xC0) { | ||
| 156 | nb = 1; | ||
| 157 | value += c; | ||
| 158 | } else if ((0xF0 & c) == 0xE0) { | ||
| 159 | nb = 2; | ||
| 160 | value += c; | ||
| 161 | } else if ((0xF8 & c) == 0xF0) { | ||
| 162 | nb = 3; | ||
| 163 | value += c; | ||
| 164 | } else if (c < 32 && c != 9 && c != 10 && c != 12) { | ||
| 165 | char s[4]; | ||
| 166 | snprintf(s, 4, "%%%2x", (unsigned char)c); | ||
| 167 | value += s; | ||
| 168 | } else if (c == '&') { | ||
| 169 | value += "&"; | ||
| 170 | } else if (c == '<') { | ||
| 171 | value += "<"; | ||
| 172 | } else if (c == '>') { | ||
| 173 | value += ">"; | ||
| 174 | } else if (c == '\'') { | ||
| 175 | value += "'"; | ||
| 176 | } else { | ||
| 177 | value += c; | ||
| 178 | } | ||
| 179 | p++; | ||
| 180 | } | ||
| 181 | } | ||
| 182 | protected: | ||
| 183 | void startAnalysis(const Strigi::AnalysisResult* ar) { | ||
| 184 | STRIGI_MUTEX_LOCK(&mutex); | ||
| 185 | std::vector<Data*>& dv = data[STRIGI_THREAD_SELF()]; | ||
| 186 | STRIGI_MUTEX_UNLOCK(&mutex); | ||
| 187 | unsigned char depth = ar->depth(); | ||
| 188 | if (depth >= dv.size()) { | ||
| 189 | dv.push_back(new Data()); | ||
| 190 | } | ||
| 191 | Data* d = dv[depth]; | ||
| 192 | ar->setWriterData(d); | ||
| 193 | } | ||
| 194 | void printValue(const Strigi::AnalyzerConfiguration& config, | ||
| 195 | const Strigi::RegisteredField* name, std::string& value) { | ||
| 196 | if (config.indexType(name) != Strigi::AnalyzerConfiguration::None) { | ||
| 197 | const Tag* tag = static_cast<const Tag*>(name->writerData()); | ||
| 198 | escape(value); | ||
| 199 | out << tag->open << value << tag->close; | ||
| 200 | } | ||
| 201 | } | ||
| 202 | void finishAnalysis(const Strigi::AnalysisResult* ar) { | ||
| 203 | STRIGI_MUTEX_LOCK(&mutex); | ||
| 204 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 205 | const Strigi::AnalyzerConfiguration& config = ar->config(); | ||
| 206 | //const Strigi::FieldRegister& fr = config.fieldRegister(); | ||
| 207 | std::string v = ar->path(); | ||
| 208 | escape(v); | ||
| 209 | /* out << " <" << mapping.map("file") << " " << mapping.map("uri") | ||
| 210 | << "='" << v << "' " << mapping.map("mtime") << "='" | ||
| 211 | << (int)ar->mTime() | ||
| 212 | << "'>\n"; | ||
| 213 | |||
| 214 | if (ar->encoding().size()) { | ||
| 215 | v.assign(ar->encoding()); | ||
| 216 | printValue(config, fr.encodingField, v); | ||
| 217 | } | ||
| 218 | |||
| 219 | std::multimap<const Strigi::RegisteredField*, std::string>::iterator | ||
| 220 | i, end; | ||
| 221 | end = d->values.end(); | ||
| 222 | for (i = d->values.begin(); i != end; ++i) { | ||
| 223 | printValue(config, i->first, i->second); | ||
| 224 | } | ||
| 225 | std::ostringstream oss; | ||
| 226 | oss << (int)ar->depth(); | ||
| 227 | v = oss.str(); | ||
| 228 | printValue(config, fr.embeddepthField, v); | ||
| 229 | if (d->text.size() > 0) { | ||
| 230 | out << " <text>"; | ||
| 231 | printText(d->text); | ||
| 232 | out << "</text>\n"; | ||
| 233 | } | ||
| 234 | out << " </" << mapping.map("file") << ">\n"; | ||
| 235 | */ | ||
| 236 | STRIGI_MUTEX_UNLOCK(&mutex); | ||
| 237 | |||
| 238 | std::string subj = d->values.find(config.fieldRegister().pathField)->second; | ||
| 239 | |||
| 240 | for (std::multimap<const Strigi::RegisteredField*, std::string>::iterator i = d->values.begin(); | ||
| 241 | i != d->values.end(); i++) { | ||
| 242 | addTriplet(subj, i->first->key(), i->second); | ||
| 243 | } | ||
| 244 | if (!d->text.empty()) | ||
| 245 | addTriplet(subj,"http://www.semanticdesktop.org/ontologies/2007/01/19/nie#plainTextContent",d->text); | ||
| 246 | |||
| 247 | d->values.clear(); | ||
| 248 | d->text.assign(""); | ||
| 249 | } | ||
| 250 | void addText(const Strigi::AnalysisResult* ar, const char* text, | ||
| 251 | int32_t length) { | ||
| 252 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 253 | if (d->text.size() < 10000000) { | ||
| 254 | d->text.append(text, length); | ||
| 255 | d->text.append("\n"); | ||
| 256 | } | ||
| 257 | } | ||
| 258 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 259 | const Strigi::RegisteredField* field, const std::string& value) { | ||
| 260 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 261 | d->values.insert( | ||
| 262 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 263 | field, value)); | ||
| 264 | } | ||
| 265 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 266 | const Strigi::RegisteredField* field, | ||
| 267 | const unsigned char* data, uint32_t size) { | ||
| 268 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 269 | d->values.insert( | ||
| 270 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 271 | field, std::string((const char*)data, size))); | ||
| 272 | } | ||
| 273 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 274 | const Strigi::RegisteredField* field, uint32_t value) { | ||
| 275 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 276 | static std::ostringstream v; | ||
| 277 | v.str(""); | ||
| 278 | v << value; | ||
| 279 | d->values.insert( | ||
| 280 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 281 | field, v.str())); | ||
| 282 | } | ||
| 283 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 284 | const Strigi::RegisteredField* field, int32_t value) { | ||
| 285 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 286 | static std::ostringstream v; | ||
| 287 | v.str(""); | ||
| 288 | v << value; | ||
| 289 | d->values.insert( | ||
| 290 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 291 | field, v.str())); | ||
| 292 | } | ||
| 293 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 294 | const Strigi::RegisteredField* field, double value) { | ||
| 295 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 296 | static std::ostringstream v; | ||
| 297 | v.str(""); | ||
| 298 | v << value; | ||
| 299 | d->values.insert( | ||
| 300 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 301 | field, v.str())); | ||
| 302 | } | ||
| 303 | void addTriplet(const std::string& subject, | ||
| 304 | const std::string& predicate, const std::string& object) { | ||
| 305 | STRIGI_MUTEX_LOCK(&mutex); | ||
| 306 | rdf[subject][predicate].push_back(object); | ||
| 307 | STRIGI_MUTEX_UNLOCK(&mutex); | ||
| 308 | } | ||
| 309 | void addValue(const Strigi::AnalysisResult*, | ||
| 310 | const Strigi::RegisteredField* field, const std::string& name, | ||
| 311 | const std::string& value) {} | ||
| 312 | void initWriterData(const Strigi::FieldRegister&); | ||
| 313 | void releaseWriterData(const Strigi::FieldRegister&); | ||
| 314 | public: | ||
| 315 | explicit RdfIndexWriter(std::ostream& o, const TagMapping& m, rdfset& r) | ||
| 316 | :out(o), rdf(r), mapping(m) { | ||
| 317 | STRIGI_MUTEX_INIT(&mutex); | ||
| 318 | } | ||
| 319 | ~RdfIndexWriter() { | ||
| 320 | std::map<STRIGI_THREAD_TYPE, std::vector<Data*> >::const_iterator j; | ||
| 321 | for (j = data.begin(); j != data.end(); ++j) { | ||
| 322 | std::vector<Data*>::const_iterator i; | ||
| 323 | for (i = j->second.begin(); i != j->second.end(); ++i) { | ||
| 324 | delete *i; | ||
| 325 | } | ||
| 326 | } | ||
| 327 | STRIGI_MUTEX_DESTROY(&mutex); | ||
| 328 | } | ||
| 329 | void commit() {} | ||
| 330 | void deleteEntries(const std::vector<std::string>& entries) {} | ||
| 331 | void deleteAllEntries() {} | ||
| 332 | }; | ||
| 333 | |||
| 334 | class RdfIndexManager : public Strigi::IndexManager { | ||
| 335 | private: | ||
| 336 | RdfIndexWriter writer; | ||
| 337 | public: | ||
| 338 | RdfIndexManager(std::ostream& o, const TagMapping& m, rdfset& r) :writer(o, m, r) {} | ||
| 339 | Strigi::IndexWriter* indexWriter() { | ||
| 340 | return &writer; | ||
| 341 | } | ||
| 342 | Strigi::IndexReader* indexReader() { | ||
| 343 | return 0; | ||
| 344 | } | ||
| 345 | }; | ||
| 346 | |||
| 347 | #endif |
|   | |||
| 1 | rdf:http://www.w3.org/1999/02/22-rdf-syntax-ns# | ||
| 2 | dc:http://purl.org/dc/elements/1.1/ | ||
| 3 | audio:eh?missing? | ||
| 4 | metadata rdf:RDF | ||
| 5 | file rdf:Description | ||
| 6 | uri rdf:about | ||
| 7 | title audio:title | ||
| 8 | artist audio:artist | ||
| 9 | album audio:album |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include "tagmapping.h" | ||
| 21 | #include <iostream> | ||
| 22 | #include <fstream> | ||
| 23 | using namespace std; | ||
| 24 | |||
| 25 | TagMapping::TagMapping(const char* path) { | ||
| 26 | if (path == 0) return; | ||
| 27 | ifstream file(path); | ||
| 28 | string line; | ||
| 29 | for (;;) { | ||
| 30 | getline(file, line); | ||
| 31 | if (!file.good()) { | ||
| 32 | break; | ||
| 33 | } | ||
| 34 | string::size_type p = line.find('\t'); | ||
| 35 | if (p != string::npos) { | ||
| 36 | mapping[line.substr(0, p)] = line.substr(p+1); | ||
| 37 | } else { | ||
| 38 | p = line.find(':'); | ||
| 39 | if (p != string::npos) { | ||
| 40 | m_namespaces[line.substr(0, p)] = line.substr(p+1); | ||
| 41 | } | ||
| 42 | } | ||
| 43 | } | ||
| 44 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
#ifndef TAGMAPPING_H
#define TAGMAPPING_H

#include <map>
#include <string>

/**
 * Lookup tables read from a mapping file: field name -> output tag name,
 * and namespace prefix -> namespace URI.
 */
class TagMapping {
private:
    std::map<std::string, std::string> m_namespaces;
    std::map<std::string, std::string> mapping;
public:
    /** @param mappingfile path of the mapping file */
    TagMapping(const char* mappingfile);
    /** All namespace declarations, keyed by prefix. */
    const std::map<std::string, std::string>& namespaces() const {
        return m_namespaces;
    }
    /**
     * Mapped name for `key`, or `key` itself when no mapping exists.
     *
     * In the unmapped case the result refers to the argument, so it must
     * not be kept beyond the lifetime of the string passed in (a temporary
     * created from a string literal dies at the end of the expression).
     */
    const std::string& map(const std::string& key) const {
        std::map<std::string, std::string>::const_iterator i
            = mapping.find(key);
        return (i == mapping.end()) ?key :i->second;
    }
};

#endif
|   | |||
| 1 | #! /usr/bin/python | ||
| 2 | import sys | ||
| 3 | import time | ||
| 4 | from xml.sax import make_parser, handler, SAXException | ||
| 5 | |||
class UriLogger(handler.ContentHandler):
    """SAX handler that counts elements carrying a 'uri' attribute.

    A progress line (count, elements per second, last uri) is printed for
    every 1000th such element and a summary line at the end of the
    document. The code runs on both Python 2 and Python 3.
    """

    def __init__(self):
        handler.ContentHandler.__init__(self)
        self.count = 0
        # time of the first element with a uri; 0 means "none seen yet"
        self.start = 0

    def _rate(self, events):
        """Return events per second since the first uri, or 0 if unknown."""
        if self.start == 0:
            return 0
        elapsed = time.time() - self.start
        if elapsed <= 0:
            # clock resolution too coarse to measure: avoid dividing by zero
            return 0
        return events / elapsed

    def startElement(self, name, attrs):
        if 'uri' in attrs:
            if self.start == 0:
                self.start = time.time()
            self.uri = attrs['uri']
            self.count += 1
            if self.count % 1000 == 0:
                print('%9d %9d %s'
                      % (self.count, self._rate(self.count), self.uri))

    def endDocument(self):
        # the first element only starts the clock, hence count - 1
        print('%9d %9d' % (self.count, self._rate(self.count - 1)))
| 27 | |||
| 28 | # this script reads from standard input and parses it as xml | ||
| 29 | # if the xml is invalid, it will print an error message | ||
| 30 | |||
# Parse standard input with a UriLogger attached. On a SAX error, report
# the last uri seen (if any) followed by the error itself.
parser = make_parser()
urilogger = UriLogger()
parser.setContentHandler(urilogger)

try:
    parser.parse(sys.stdin)
except SAXException as e:
    # 'uri' only exists once an element with a uri attribute was seen
    if hasattr(urilogger, 'uri'):
        print("Error after " + urilogger.uri)
    print(e)
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifdef HAVE_CONFIG_H | ||
| 21 | #include <config.h> | ||
| 22 | #endif | ||
| 23 | |||
| 24 | #include <strigi/strigiconfig.h> | ||
| 25 | #include "xmlindexwriter.h" | ||
| 26 | #include <strigi/analyzerconfiguration.h> | ||
| 27 | #include <strigi/diranalyzer.h> | ||
| 28 | #include <strigi/fileinputstream.h> | ||
| 29 | #include <iostream> | ||
| 30 | #include <cstring> | ||
| 31 | #ifdef HAVE_UNISTD_H | ||
| 32 | #include <unistd.h> | ||
| 33 | #endif | ||
| 34 | #ifdef HAVE_DIRECT_H | ||
| 35 | #include <direct.h> | ||
| 36 | #endif | ||
| 37 | #include <stdlib.h> | ||
| 38 | #include <time.h> | ||
| 39 | |||
| 40 | using namespace std; | ||
| 41 | using namespace Strigi; | ||
| 42 | |||
/**
 * Print the command line synopsis to stderr.
 *
 * @param argv argv[0] is printed as the program name
 * @return always -1, so callers can write `return usage(argc, argv);`
 */
int
usage(int /*argc*/, char** argv) {
    static const char options[] =
        " [--mappingfile <mappingfile>]\n"
        " [--lastfiletoskip FILE]\n"
        " [--stdinmtime mtime]\n [--stdinfilename filename]\n"
        " [dirs-or-files-to-index]\n"
        " [-j nthreads]\n";
    fprintf(stderr, "Usage: %s\n%s", argv[0], options);
    return -1;
}
/**
 * Report whether any argument after argv[0] is "--help" or "-h".
 */
bool
containsHelp(int argc, char **argv) {
    int index = 1;
    while (index < argc) {
        const char* candidate = argv[index++];
        const bool longForm = strcmp(candidate, "--help") == 0;
        if (longForm || strcmp(candidate, "-h") == 0) {
            return true;
        }
    }
    return false;
}
| 61 | void | ||
| 62 | analyzeFromStdin(XmlIndexManager& manager, AnalyzerConfiguration& ac, | ||
| 63 | const string& filename, time_t mtime) { | ||
| 64 | StreamAnalyzer sa(ac); | ||
| 65 | sa.setIndexWriter(*manager.indexWriter()); | ||
| 66 | FileInputStream in(stdin, filename.c_str()); | ||
| 67 | AnalysisResult result(filename, mtime, *manager.indexWriter(), sa); | ||
| 68 | sa.analyze(result, &in); | ||
| 69 | } | ||
| 70 | |||
| 71 | int | ||
| 72 | main(int argc, char **argv) { | ||
| 73 | vector<string> dirs; | ||
| 74 | int nthreads = 2; | ||
| 75 | const char* mappingfile = 0; | ||
| 76 | string lastFileToSkip; | ||
| 77 | time_t stdinMTime = time(0); | ||
| 78 | string stdinFilename = "-"; | ||
| 79 | int i = 0; | ||
| 80 | while (++i < argc) { | ||
| 81 | const char* arg = argv[i]; | ||
| 82 | if (!strcmp("-h", arg) || !strcmp("--help", arg)) { | ||
| 83 | return usage(argc, argv); | ||
| 84 | } | ||
| 85 | if (!strcmp("-j", arg)) { | ||
| 86 | if (++i == argc) { | ||
| 87 | return usage(argc, argv); | ||
| 88 | } | ||
| 89 | char* end; | ||
| 90 | nthreads = (int)strtol(argv[i], &end, 10); | ||
| 91 | if (end == argv[i] || nthreads < 1) { | ||
| 92 | return usage(argc, argv); | ||
| 93 | } | ||
| 94 | } else if (!strcmp("--mappingfile", arg)) { | ||
| 95 | if (++i == argc) { | ||
| 96 | return usage(argc, argv); | ||
| 97 | } | ||
| 98 | mappingfile = argv[i]; | ||
| 99 | } else if (!strcmp("--lastfiletoskip", arg)) { | ||
| 100 | if (++i == argc) { | ||
| 101 | return usage(argc, argv); | ||
| 102 | } | ||
| 103 | lastFileToSkip = argv[i]; | ||
| 104 | } else if (!strcmp("--stdinmtime", arg)) { | ||
| 105 | if (++i == argc) { | ||
| 106 | return usage(argc, argv); | ||
| 107 | } | ||
| 108 | char* end; | ||
| 109 | stdinMTime = strtol(argv[i], &end, 10); | ||
| 110 | if (end == argv[i] || stdinMTime < 1) { | ||
| 111 | return usage(argc, argv); | ||
| 112 | } | ||
| 113 | } else if (!strcmp("--stdinfilename", arg)) { | ||
| 114 | if (++i == argc) { | ||
| 115 | return usage(argc, argv); | ||
| 116 | } | ||
| 117 | stdinFilename = argv[i]; | ||
| 118 | } else { | ||
| 119 | const char* dir = argv[i]; | ||
| 120 | // remove trailing '/' | ||
| 121 | size_t len = strlen(dir); | ||
| 122 | if (dir[len-1] == '/') { | ||
| 123 | dirs.push_back(std::string(dir, len-1)); | ||
| 124 | } else { | ||
| 125 | dirs.push_back(dir); | ||
| 126 | } | ||
| 127 | } | ||
| 128 | } | ||
| 129 | |||
| 130 | if (dirs.size() == 0) { | ||
| 131 | char buf[1024]; | ||
| 132 | if (getcwd(buf, 1023) == NULL) { | ||
| 133 | return -1; | ||
| 134 | } | ||
| 135 | dirs.push_back(buf); | ||
| 136 | } | ||
| 137 | |||
| 138 | vector<pair<bool,string> >filters; | ||
| 139 | filters.push_back(make_pair<bool,string>(false,".*/")); | ||
| 140 | filters.push_back(make_pair<bool,string>(false,".*")); | ||
| 141 | AnalyzerConfiguration ic; | ||
| 142 | ic.setFilters(filters); | ||
| 143 | |||
| 144 | const TagMapping mapping(mappingfile); | ||
| 145 | cout << "<?xml version='1.0' encoding='UTF-8'?>\n<" | ||
| 146 | << mapping.map("metadata"); | ||
| 147 | map<string, string>::const_iterator k = mapping.namespaces().begin(); | ||
| 148 | while (k != mapping.namespaces().end()) { | ||
| 149 | cout << " xmlns:" << k->first << "='" << k->second << "'"; | ||
| 150 | k++; | ||
| 151 | } | ||
| 152 | cout << ">\n"; | ||
| 153 | |||
| 154 | XmlIndexManager manager(cout, mapping); | ||
| 155 | DirAnalyzer analyzer(manager, ic); | ||
| 156 | for (unsigned i = 0; i < dirs.size(); ++i) { | ||
| 157 | if (dirs[i] == "-") { | ||
| 158 | analyzeFromStdin(manager, ic, stdinFilename, stdinMTime); | ||
| 159 | } else { | ||
| 160 | analyzer.analyzeDir(dirs[i], nthreads, 0, lastFileToSkip); | ||
| 161 | } | ||
| 162 | } | ||
| 163 | cout << "</" << mapping.map("metadata") << ">\n"; | ||
| 164 | |||
| 165 | return 0; | ||
| 166 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2007 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #include "xmlindexwriter.h" | ||
| 21 | using namespace std; | ||
| 22 | using namespace Strigi; | ||
| 23 | |||
| 24 | void | ||
| 25 | XmlIndexWriter::initWriterData(const FieldRegister& f) { | ||
| 26 | map<string, RegisteredField*>::const_iterator i; | ||
| 27 | map<string, RegisteredField*>::const_iterator end(f.fields().end()); | ||
| 28 | for (i = f.fields().begin(); i != end; ++i) { | ||
| 29 | Tag* tag = static_cast<Tag*>(i->second->writerData()); | ||
| 30 | if (tag) { | ||
| 31 | tag->refcount++; | ||
| 32 | continue; | ||
| 33 | } | ||
| 34 | tag = new Tag(); | ||
| 35 | tag->refcount = 1; | ||
| 36 | const string s(i->first); | ||
| 37 | const string& n = mapping.map(s); | ||
| 38 | if (s == n) { | ||
| 39 | tag->open = " <value name='" + n + "'>"; | ||
| 40 | tag->close = "</value>\n"; | ||
| 41 | } else { | ||
| 42 | tag->open = " <" + n + '>'; | ||
| 43 | tag->close = "</" + n + ">\n"; | ||
| 44 | } | ||
| 45 | i->second->setWriterData(tag); | ||
| 46 | } | ||
| 47 | } | ||
| 48 | void | ||
| 49 | XmlIndexWriter::releaseWriterData(const FieldRegister& f) { | ||
| 50 | map<string, RegisteredField*>::const_iterator i; | ||
| 51 | map<string, RegisteredField*>::const_iterator end(f.fields().end()); | ||
| 52 | for (i = f.fields().begin(); i != end; ++i) { | ||
| 53 | Tag* tag = static_cast<Tag*>(i->second->writerData()); | ||
| 54 | if (tag->refcount-- == 1) { | ||
| 55 | //fprintf(stderr, "free for %s\n", i->second->key().c_str()); | ||
| 56 | delete tag; | ||
| 57 | i->second->setWriterData(0); | ||
| 58 | } | ||
| 59 | } | ||
| 60 | } |
|   | |||
| 1 | /* This file is part of Strigi Desktop Search | ||
| 2 | * | ||
| 3 | * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info> | ||
| 4 | * | ||
| 5 | * This library is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU Library General Public | ||
| 7 | * License as published by the Free Software Foundation; either | ||
| 8 | * version 2 of the License, or (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This library is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | * Library General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU Library General Public License | ||
| 16 | * along with this library; see the file COPYING.LIB. If not, write to | ||
| 17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
| 18 | * Boston, MA 02110-1301, USA. | ||
| 19 | */ | ||
| 20 | #ifndef XMLINDEXWRITER_H | ||
| 21 | #define XMLINDEXWRITER_H | ||
| 22 | |||
| 23 | #include "tagmapping.h" | ||
| 24 | #include <strigi/indexwriter.h> | ||
| 25 | #include <strigi/indexmanager.h> | ||
| 26 | #include <strigi/analysisresult.h> | ||
| 27 | #include <strigi/fieldtypes.h> | ||
| 28 | #include <strigi/analyzerconfiguration.h> | ||
| 29 | #include <strigi/strigi_thread.h> | ||
| 30 | #include <iostream> | ||
| 31 | #include <sstream> | ||
| 32 | #include <map> | ||
| 33 | |||
| 34 | class XmlIndexWriter : public Strigi::IndexWriter { | ||
| 35 | private: | ||
// Field values (keyed by registered field) and a text buffer.
// NOTE(review): presumably one instance per analysis result, as in
// RdfIndexWriter - the code using it is outside this view; confirm.
struct Data {
    std::multimap<const Strigi::RegisteredField*, std::string> values;
    std::string text;
};
// Data objects grouped per thread identifier.
std::map<STRIGI_THREAD_TYPE, std::vector<Data*> > data;
// Opening/closing markup attached to a registered field by
// initWriterData, shared through `refcount`.
struct Tag {
    std::string open;
    std::string close;
    int refcount;
};

STRIGI_MUTEX_DEFINE(mutex);
// Stream that printText writes to.
std::ostream& out;

// Field name -> tag name mapping consulted by initWriterData.
const TagMapping& mapping;
| 51 | |||
| 52 | void printText(const std::string& text) { | ||
| 53 | const char* p = text.c_str(); | ||
| 54 | const char* end = p + text.size(); | ||
| 55 | char nb = 0; | ||
| 56 | bool lastwhite = true; | ||
| 57 | while (p < end) { | ||
| 58 | char c = *p; | ||
| 59 | if (nb) { | ||
| 60 | if ((0xC0 & c) != 0x80) { | ||
| 61 | return; | ||
| 62 | } | ||
| 63 | out.put(c); | ||
| 64 | nb--; | ||
| 65 | } else if ((0xE0 & c) == 0xC0) { | ||
| 66 | nb = 1; | ||
| 67 | out.put(c); | ||
| 68 | } else if ((0xF0 & c) == 0xE0) { | ||
| 69 | nb = 2; | ||
| 70 | out.put(c); | ||
| 71 | } else if ((0xF8 & c) == 0xF0) { | ||
| 72 | nb = 3; | ||
| 73 | out.put(c); | ||
| 74 | } else if (c <= 8) { | ||
| 75 | return; | ||
| 76 | } else if (c == '&') { | ||
| 77 | out << "&"; | ||
| 78 | } else if (c == '<') { | ||
| 79 | out << "<"; | ||
| 80 | } else if (c == '>') { | ||
| 81 | out << ">"; | ||
| 82 | } else if (isspace(c) != 0) { | ||
| 83 | // we've to handle dos formatting | ||
| 84 | //'\r' char is ignored, it isn't wroten to out and doesn't | ||
| 85 | //change lastwhite value (so the following '\n' will be handled) | ||
| 86 | if (!lastwhite && (c!= '\r')) { | ||
| 87 | out.put(c); | ||
| 88 | lastwhite = true; | ||
| 89 | } | ||
| 90 | } else { | ||
| 91 | lastwhite = false; | ||
| 92 | out.put(c); | ||
| 93 | } | ||
| 94 | p++; | ||
| 95 | } | ||
| 96 | } | ||
/**
 * Escape `value` in place so it can be written as XML element content or
 * inside a single-quoted attribute value.
 *
 *  - '&', '<', '>' and '\'' become &amp;, &lt;, &gt; and &apos;.
 *  - Characters below 32 other than 9, 10 and 12 become '%' followed by two
 *    hex digits. Where `char` is signed, bytes >= 0x80 that do not start a
 *    UTF-8 sequence take the same path.
 *  - If the input is not well-formed UTF-8 (bad continuation byte, or a
 *    sequence cut off at the end of the string) `value` is set to "".
 *
 * NOTE(review): XML 1.0 permits 0x0D (CR) but not 0x0C (FF); the exemption
 * of 12 below may have been meant to be 13 - confirm before changing.
 */
static void escape(std::string& value) {
    std::string::size_type namp = 0;
    std::string::size_type nlt = 0;
    std::string::size_type ngt = 0;
    std::string::size_type napos = 0;
    std::string::size_type nexcept = 0;
    const char* p = value.c_str();
    const char* end = p + value.size();
    char nb = 0;  // continuation bytes still expected

    // First pass: validate the UTF-8 structure and count what needs escaping.
    while (p < end) {
        char c = *p;
        if (nb) {
            if ((0xC0 & c) != 0x80) {
                value = "";
                return;
            }
            nb--;
        } else if ((0xE0 & c) == 0xC0) {
            nb = 1;
        } else if ((0xF0 & c) == 0xE0) {
            nb = 2;
        } else if ((0xF8 & c) == 0xF0) {
            nb = 3;
        } else if (c < 32 && c != 9 && c != 10 && c != 12) {
            nexcept++;
        } else if (c == '&') {
            namp++;
        } else if (c == '<') {
            nlt++;
        } else if (c == '>') {
            ngt++;
        } else if (c == '\'') {
            napos++;
        }
        p++;
    }
    // A multi-byte sequence that is cut off by the end of the string is as
    // invalid as one with a bad continuation byte.
    if (nb) {
        value = "";
        return;
    }
    // if no character has to be escaped, just return
    if (!(namp || nlt || ngt || napos || nexcept)) {
        return;
    }

    // Second pass: rebuild the string. The input is known to be well formed
    // here, so nb only routes continuation bytes past the escaping tests.
    std::string ov(value);
    p = ov.c_str();
    end = p + ov.size();
    value.clear();
    value.reserve(ov.size() + 4*namp + 3*(nlt + ngt) + 5*napos + 2*nexcept);
    while (p < end) {
        char c = *p;
        if (nb) {
            nb--;
            value += c;
        } else if ((0xE0 & c) == 0xC0) {
            nb = 1;
            value += c;
        } else if ((0xF0 & c) == 0xE0) {
            nb = 2;
            value += c;
        } else if ((0xF8 & c) == 0xF0) {
            nb = 3;
            value += c;
        } else if (c < 32 && c != 9 && c != 10 && c != 12) {
            char s[4];  // '%' + two hex digits + terminator
            snprintf(s, sizeof(s), "%%%02x",
                static_cast<unsigned int>(static_cast<unsigned char>(c)));
            value += s;
        } else if (c == '&') {
            value += "&amp;";
        } else if (c == '<') {
            value += "&lt;";
        } else if (c == '>') {
            value += "&gt;";
        } else if (c == '\'') {
            value += "&apos;";
        } else {
            value += c;
        }
        p++;
    }
}
| 177 | protected: | ||
| 178 | void startAnalysis(const Strigi::AnalysisResult* ar) { | ||
| 179 | STRIGI_MUTEX_LOCK(&mutex); | ||
| 180 | std::vector<Data*>& dv = data[STRIGI_THREAD_SELF()]; | ||
| 181 | STRIGI_MUTEX_UNLOCK(&mutex); | ||
| 182 | unsigned char depth = ar->depth(); | ||
| 183 | if (depth >= dv.size()) { | ||
| 184 | dv.push_back(new Data()); | ||
| 185 | } | ||
| 186 | Data* d = dv[depth]; | ||
| 187 | ar->setWriterData(d); | ||
| 188 | } | ||
| 189 | void printValue(const Strigi::AnalyzerConfiguration& config, | ||
| 190 | const Strigi::RegisteredField* name, std::string& value) { | ||
| 191 | if (config.indexType(name) != Strigi::AnalyzerConfiguration::None) { | ||
| 192 | const Tag* tag = static_cast<const Tag*>(name->writerData()); | ||
| 193 | escape(value); | ||
| 194 | out << tag->open << value << tag->close; | ||
| 195 | } | ||
| 196 | } | ||
| 197 | void finishAnalysis(const Strigi::AnalysisResult* ar) { | ||
| 198 | STRIGI_MUTEX_LOCK(&mutex); | ||
| 199 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 200 | const Strigi::AnalyzerConfiguration& config = ar->config(); | ||
| 201 | const Strigi::FieldRegister& fr = config.fieldRegister(); | ||
| 202 | std::string v = ar->path(); | ||
| 203 | escape(v); | ||
| 204 | out << " <" << mapping.map("file") << " " << mapping.map("uri") | ||
| 205 | << "='" << v << "' " << mapping.map("mtime") << "='" | ||
| 206 | << (int)ar->mTime() | ||
| 207 | << "'>\n"; | ||
| 208 | |||
| 209 | if (ar->encoding().size()) { | ||
| 210 | v.assign(ar->encoding()); | ||
| 211 | printValue(config, fr.encodingField, v); | ||
| 212 | } | ||
| 213 | |||
| 214 | std::multimap<const Strigi::RegisteredField*, std::string>::iterator | ||
| 215 | i, end; | ||
| 216 | end = d->values.end(); | ||
| 217 | for (i = d->values.begin(); i != end; ++i) { | ||
| 218 | printValue(config, i->first, i->second); | ||
| 219 | } | ||
| 220 | std::ostringstream oss; | ||
| 221 | oss << (int)ar->depth(); | ||
| 222 | v = oss.str(); | ||
| 223 | printValue(config, fr.embeddepthField, v); | ||
| 224 | if (d->text.size() > 0) { | ||
| 225 | out << " <text>"; | ||
| 226 | printText(d->text); | ||
| 227 | out << "</text>\n"; | ||
| 228 | } | ||
| 229 | out << " </" << mapping.map("file") << ">\n"; | ||
| 230 | STRIGI_MUTEX_UNLOCK(&mutex); | ||
| 231 | d->values.clear(); | ||
| 232 | d->text.assign(""); | ||
| 233 | } | ||
| 234 | void addText(const Strigi::AnalysisResult* ar, const char* text, | ||
| 235 | int32_t length) { | ||
| 236 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 237 | if (d->text.size() < 10000000) { | ||
| 238 | d->text.append(text, length); | ||
| 239 | d->text.append("\n"); | ||
| 240 | } | ||
| 241 | } | ||
| 242 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 243 | const Strigi::RegisteredField* field, const std::string& value) { | ||
| 244 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 245 | d->values.insert( | ||
| 246 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 247 | field, value)); | ||
| 248 | } | ||
| 249 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 250 | const Strigi::RegisteredField* field, | ||
| 251 | const unsigned char* data, uint32_t size) { | ||
| 252 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 253 | d->values.insert( | ||
| 254 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 255 | field, std::string((const char*)data, size))); | ||
| 256 | } | ||
| 257 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 258 | const Strigi::RegisteredField* field, uint32_t value) { | ||
| 259 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 260 | static std::ostringstream v; | ||
| 261 | v.str(""); | ||
| 262 | v << value; | ||
| 263 | d->values.insert( | ||
| 264 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 265 | field, v.str())); | ||
| 266 | } | ||
| 267 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 268 | const Strigi::RegisteredField* field, int32_t value) { | ||
| 269 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 270 | static std::ostringstream v; | ||
| 271 | v.str(""); | ||
| 272 | v << value; | ||
| 273 | d->values.insert( | ||
| 274 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 275 | field, v.str())); | ||
| 276 | } | ||
| 277 | void addValue(const Strigi::AnalysisResult* ar, | ||
| 278 | const Strigi::RegisteredField* field, double value) { | ||
| 279 | Data* d = static_cast<Data*>(ar->writerData()); | ||
| 280 | static std::ostringstream v; | ||
| 281 | v.str(""); | ||
| 282 | v << value; | ||
| 283 | d->values.insert( | ||
| 284 | std::make_pair<const Strigi::RegisteredField* const, std::string>( | ||
| 285 | field, v.str())); | ||
| 286 | } | ||
// Triplets are not written by this writer; the call is a no-op.
void addTriplet(const std::string& /*subject*/,
        const std::string& /*predicate*/,
        const std::string& /*object*/) {
}
| 289 | void addValue(const Strigi::AnalysisResult*, | ||
| 290 | const Strigi::RegisteredField* field, const std::string& name, | ||
| 291 | const std::string& value) {} | ||
// Declarations only; the definitions are not part of this section.
// NOTE(review): presumably these attach and release the per-field Tag
// objects that printValue() reads through writerData() - confirm against
// the definitions.
void initWriterData(const Strigi::FieldRegister&);
void releaseWriterData(const Strigi::FieldRegister&);
| 294 | public: | ||
| 295 | explicit XmlIndexWriter(std::ostream& o, const TagMapping& m) | ||
| 296 | :out(o), mapping(m) { | ||
| 297 | STRIGI_MUTEX_INIT(&mutex); | ||
| 298 | } | ||
| 299 | ~XmlIndexWriter() { | ||
| 300 | std::map<STRIGI_THREAD_TYPE, std::vector<Data*> >::const_iterator j; | ||
| 301 | for (j = data.begin(); j != data.end(); ++j) { | ||
| 302 | std::vector<Data*>::const_iterator i; | ||
| 303 | for (i = j->second.begin(); i != j->second.end(); ++i) { | ||
| 304 | delete *i; | ||
| 305 | } | ||
| 306 | } | ||
| 307 | STRIGI_MUTEX_DESTROY(&mutex); | ||
| 308 | } | ||
// No-op: this writer has nothing to commit.
void commit() {}
// No-op: this writer does not delete entries.
void deleteEntries(const std::vector<std::string>& /*entries*/) {
}
// No-op: this writer does not delete entries.
void deleteAllEntries() {}
| 312 | }; | ||
| 313 | |||
| 314 | class XmlIndexManager : public Strigi::IndexManager { | ||
| 315 | private: | ||
| 316 | XmlIndexWriter writer; | ||
| 317 | public: | ||
| 318 | XmlIndexManager(std::ostream& o, const TagMapping& m) :writer(o, m) {} | ||
| 319 | Strigi::IndexWriter* indexWriter() { | ||
| 320 | return &writer; | ||
| 321 | } | ||
| 322 | Strigi::IndexReader* indexReader() { | ||
| 323 | return 0; | ||
| 324 | } | ||
| 325 | }; | ||
| 326 | |||
| 327 | #endif |

