Commit 8a1c2c2

added featurizers
1 parent 5eb6045 commit 8a1c2c2

File tree

13 files changed: +328 −99 lines changed

.gitmodules

+3
@@ -13,3 +13,6 @@
 [submodule "lib/cpp/pattern_matching"]
 	path = lib/cpp/pattern_matching
 	url = https://github.com/roberto-trani/pattern_matching.git
+[submodule "lib/cpp/buffered_stream"]
+	path = lib/cpp/buffered_stream
+	url = https://github.com/roberto-trani/buffered_stream.git
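
Since the commit adds a new submodule, a fresh checkout needs the standard submodule step (plain git, nothing repo-specific):

    git submodule update --init --recursive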

cfg.py

+13-7
@@ -4,18 +4,24 @@
 
 # base directory
 base_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
-# add the python libraries to the sys path
-sys.path.append(base_dir + "lib/python/")
-sys.path.append(base_dir + "lib/cpp/")
+data_dir = base_dir + "data/"
+lib_dir = base_dir + "lib/"
+
+# add the python libraries to the python path
+sys.path.append(lib_dir + "python/")
+sys.path.append(lib_dir + "cpp/")
+sys.path.append(lib_dir + "cython/")
 
 # other directories
-raw_dir = base_dir + "raw/"
-processed_dir = base_dir + "processed/"
-thesaurus_dir = base_dir + "thesaurus/"
+processed_dir = data_dir + "processed/"
+raw_dir = data_dir + "raw/"
+thesaurus_dir = data_dir + "thesaurus/"
+tmp_dir = data_dir + "tmp/"
 
 # number of parts the wikipedia file must be split into
 wiki_preprocessing_split_into = 10
 
 # some checks for consistency
 assert os.path.isdir(base_dir)
-assert os.path.isdir(raw_dir)
+assert os.path.isdir(data_dir)
+assert os.path.isdir(lib_dir)
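
A downstream script would pick up the reorganized layout through cfg (a hypothetical sketch; the script location and file names are placeholders, only the cfg attributes come from the diff above):

    import sys
    sys.path.append("../")  # adjust to wherever cfg.py lives relative to the script
    import cfg

    # raw inputs now live under data/raw/, intermediate output under data/processed/
    in_path = cfg.raw_dir + "wikipedia.xml.gz"
    out_path = cfg.processed_dir + "wikipedia_sentences.txt"
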
File renamed without changes.

lib/cpp/buffered_stream

Submodule buffered_stream added at 99a8405

lib/cython/setup.py

+55
@@ -0,0 +1,55 @@
+import numpy
+import os
+import sys
+from Cython.Build import cythonize
+from distutils.core import setup, Extension
+
+
+def add_extension(extensions, name, **kwargs):
+    assert isinstance(name, str) and "/" not in name
+    assert name not in extensions
+
+    source = name.replace(".", "/") + ".pyx"
+    if "language" not in kwargs:
+        kwargs["language"] = "c++"
+    if "extra_compile_args" in kwargs:
+        kwargs["extra_compile_args"] += ["-std=c++11", "-O3"]
+    else:
+        kwargs["extra_compile_args"] = ["-std=c++11", "-O3"]
+
+    extensions[name] = Extension(
+        name,
+        sources=[source],
+        **kwargs
+    )
+
+
+if __name__ == "__main__":
+    sys.path.append("../../")
+    import cfg
+    extensions = e = dict()
+
+    # set the compiler
+    os.environ["CC"] = "g++-7"
+
+    # collection_stats
+    kwargs = {"include_dirs": []}
+    kwargs["include_dirs"].append(cfg.lib_dir + "cpp")
+    kwargs["include_dirs"].append(cfg.lib_dir + "cpp/pattern_matching")
+    add_extension(e, 'collection_stats.collection_stats', extra_link_args=['-fopenmp'], extra_compile_args=['-fopenmp'], **kwargs)
+    add_extension(e, 'collection_stats.collection_stats_restricted', extra_link_args=['-fopenmp'], extra_compile_args=['-fopenmp'], **kwargs)
+    # featurizers
+    kwargs["include_dirs"].append(numpy.get_include())
+    add_extension(e, 'feature_extraction.featurizer_textual', **kwargs)
+    add_extension(e, 'feature_extraction.featurizer_tags', **kwargs)
+    add_extension(e, 'feature_extraction.featurizer_w2v', **kwargs)
+    add_extension(e, 'feature_extraction.featurizer_sigir08', **kwargs)
+    add_extension(e, 'feature_extraction.featurizer_sigir08extended', **kwargs)
+    add_extension(e, 'feature_extraction.featurizer_custom', **kwargs)
+    add_extension(e, 'feature_extraction.featurizer_qpp', **kwargs)
+
+    # setup
+    setup(
+        ext_modules=cythonize(extensions.values()),
+        packages=extensions.keys(),
+    )
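
Building the extensions in place would follow the usual Cython/distutils pattern (assuming a Python 2 environment with Cython, NumPy, and g++-7 installed, per the hard-coded CC above):

    cd lib/cython && python setup.py build_ext --inplace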

lib/python/efficient_query_expansion/__init__.py

Whitespace-only changes.

lib/python/documents_utils.py renamed to lib/python/efficient_query_expansion/documents_utils.py

+25-2
@@ -1,6 +1,11 @@
+import codecs
+import gzip
 import nltk
-from utils import get_reader, get_emitter_from_generator
+import os.path
+import sys
+
 from normalize_text import normalize_text, normalize_text_step_1
+from parallel_stream.utils import get_emitter_from_iterable
 
 
 class Doc(object):
@@ -164,6 +169,24 @@ def _xml_extractor_reader_to_doc_generator(reader):
     raise Exception("A content was expected before the end of file")
 
 
+def get_reader(infilename, encoding=None):
+    if infilename == "-":
+        reader = sys.stdin
+    else:
+        if not os.path.isfile(infilename):
+            raise Exception("File {} doesn't exist".format(infilename))
+
+        if infilename.endswith(".gz"):
+            reader = gzip.open(infilename, "rb")
+        else:
+            reader = open(infilename, "r")
+
+    if encoding is None or encoding.upper() == "ASCII":
+        return reader
+
+    return codecs.getreader(encoding)(reader)
+
+
 def doc_generator_from_file(infilenames, encoding=None, file_format="custom"):
     if isinstance(infilenames, (str, unicode)):
         infilenames = [infilenames]
@@ -197,7 +220,7 @@ def sentence_generator_from_doc_file(*args, **kwargs):
 
 
 def get_doc_emitter_from_files(*args, **kwargs):
-    return get_emitter_from_generator(doc_generator_from_file(*args, **kwargs))
+    return get_emitter_from_iterable(doc_generator_from_file(*args, **kwargs))
 
 
 def get_doc_normalizer_worker():
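
The inlined get_reader replaces what was previously imported from utils: it transparently handles stdin ("-"), gzipped files, and optional decoding via codecs. A hypothetical call (file name and process() are placeholders):

    # iterate over a gzipped UTF-8 dump line by line
    for line in get_reader("docs.xml.gz", encoding="UTF-8"):
        process(line)  # process() stands in for whatever consumes the lines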
@@ -0,0 +1,177 @@
+import cPickle
+import collections
+import json
+import socket
+import struct
+
+from utils import query_repr_to_sql_query
+
+
+QueryPerformanceSubset = collections.namedtuple(
+    "QueryPerformanceSubset",
+    ["num_ret", "exe_time"]
+)
+QueryPerformance = collections.namedtuple(
+    "QueryPerformance",
+    ["num_ret", "num_rel", "num_rel_ret", "exe_time"]
+)
+
+
+class SocketChannel(object):
+    _length_format = "<I"
+    _length_size = struct.calcsize(_length_format)
+
+    def __init__(self, host, port):
+        self._sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        self._sock.connect((host, port))
+
+    def close(self):
+        self._sock.close()
+
+    def send_request(self, request):
+        assert isinstance(request, dict)
+        self._send_msg(json.dumps(request))
+
+    def receive_reply(self):
+        return json.loads(self._recv_msg())
+
+    def _recvall(self, n):
+        # Helper function to recv n bytes or return None if EOF is hit
+        data = b''
+        while len(data) < n:
+            packet = self._sock.recv(n - len(data))
+            if not packet:
+                return None
+            data += packet
+        return data
+
+    def _send_msg(self, msg):
+        # Prefix each message with a 4-byte length (network byte order)
+        msg = struct.pack(SocketChannel._length_format, len(msg)) + msg
+        self._sock.sendall(msg)
+
+    def _recv_msg(self):
+        # Read message length and unpack it into an integer
+        raw_msglen = self._recvall(SocketChannel._length_size)
+        if not raw_msglen:
+            return None
+        msglen = struct.unpack(SocketChannel._length_format, raw_msglen)[0]
+        # Read the message data
+        return self._recvall(msglen)
+
+
+class IndexCursor(object):
+    def __init__(self, index_cache, db_cursor):
+        self._index_cache = index_cache
+        self._db_cursor = db_cursor
+
+    def close(self):
+        self._db_cursor.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc_info):
+        self.close()
+
+    def get_performance(self, query_repr, document_id_list=None, document_id_list_key=None,
+                        include_time=True, force=False):
+        # check parameters
+        assert isinstance(query_repr, (list, tuple))
+        assert document_id_list is None or (isinstance(document_id_list, (list, tuple)) and len(document_id_list) > 0 and all(isinstance(doc_id, (int, long)) for doc_id in document_id_list))
+        assert document_id_list_key is None or isinstance(document_id_list_key, (int, long))
+        assert (document_id_list_key is None) == (document_id_list is None)
+        assert isinstance(include_time, bool)
+        assert isinstance(force, bool)
+
+        zero_document_id_list = (document_id_list is None) or (len(document_id_list) == 0)
+
+        # transform the query representation in a query string
+        sql_str = query_repr_to_sql_query(query_repr)
+
+        # get the entry from the cache
+        key = sql_str if zero_document_id_list else (sql_str, document_id_list_key)
+        if not force:
+            query_performance = self._index_cache._get(key)
+            if query_performance is not None and (not include_time or query_performance.exe_time is not None):
+                return query_performance
+
+        # transform the document_id_list
+        document_id_list = [] if document_id_list is None else list(set(document_id_list))
+
+        request = {
+            "query": sql_str,
+            "query_type": "cnf"
+        }
+        if not zero_document_id_list:
+            request["rel"] = document_id_list
+        self._db_cursor.send_request(request)
+
+        result = self._db_cursor.receive_reply()
+        if "error" in result:
+            raise Exception(result["error"])
+
+        # compose the resulting object
+        if zero_document_id_list:
+            query_performance = QueryPerformanceSubset(
+                num_ret=int(result["num_ret"]),
+                exe_time=float(result["exe_time"])
+            )
+        else:
+            query_performance = QueryPerformance(
+                num_ret=int(result["num_ret"]),
+                num_rel=int(result["num_rel"]),
+                num_rel_ret=int(result["num_rel_ret"]),
+                exe_time=float(result["exe_time"])
+            )
+
+        # put the result into the cache
+        self._index_cache._put(key, query_performance)
+        if not zero_document_id_list:
+            qps = self._index_cache._get(key[0])
+            if qps is None or (include_time and qps.exe_time is None):
+                qps = QueryPerformanceSubset(
+                    num_ret=query_performance.num_ret,
+                    exe_time=query_performance.exe_time
+                )
+                self._index_cache._put(key[0], qps)
+
+        # return
+        return query_performance
+
+
+class IndexCache(object):
+    def __init__(self, host, port):
+        assert isinstance(host, str)
+        assert isinstance(port, int)
+
+        self._host = host
+        self._port = port
+        self._cache = dict()
+
+    @staticmethod
+    def load(file_path):
+        host, port, cache = cPickle.load(open(file_path, "rb"))
+        index_cache = IndexCache(host, port)
+        index_cache._cache = cache
+        return index_cache
+
+    def dump(self, file_path):
+        cPickle.dump(
+            (self._host, self._port, self._cache),
+            open(file_path, "wb"),
+            protocol=cPickle.HIGHEST_PROTOCOL
+        )
+
+    def __len__(self):
+        return len(self._cache)
+
+    def _get(self, key):
+        return self._cache.get(key, None)
+
+    def _put(self, key, value):
+        self._cache[key] = value
+
+    def cursor(self):
+        connection = SocketChannel(host=self._host, port=self._port)
+        return IndexCursor(self, connection)
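
End to end, the module speaks a length-prefixed JSON protocol to an index server and memoizes the replies. A hypothetical session (host, port, query terms, and the pickle path are placeholders; the query_repr structure matches utils.query_repr_to_sql_query below):

    cache = IndexCache("localhost", 9090)  # placeholder host/port for the index server
    with cache.cursor() as cursor:
        # one AND-query of two synsets: (efficient | fast) (expansion)
        query_repr = [[[("efficient",), ("fast",)], [("expansion",)]]]
        perf = cursor.get_performance(query_repr)
        print perf.num_ret, perf.exe_time  # a repeated call is served from the cache
    cache.dump("index_cache.pkl")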
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
+def query_repr_to_sql_query(query_repr, uniq_repr=True):
+    join_fun = (
+        lambda l, m, r, it: "{}{}{}".format(
+            l,
+            m.join(sorted(set(it)) if uniq_repr else it),
+            r
+        )
+    )
+
+    return \
+        join_fun("(", ") | (", ")", (
+            join_fun("(", ") (", ")", (
+                join_fun("", " | ", "", (
+                    "\"{}\"".format(syn_tag[0]) if " " in syn_tag[0] else syn_tag[0]
+                    for syn_tag in synset
+                ))
+                for synset in and_query
+            ))
+            for and_query in query_repr
+        ))
+
+
+def sql_query_to_query_repr(sql_query):
+    assert sql_query[:2] == "((" and sql_query[-2:] == "))"
+
+    query_repr = \
+        [
+            [
+                [
+                    (syn[1:-1] if (syn[0] == syn[-1] == "\"") else syn, )
+                    for syn in synset.split(" | ")
+                ]
+                for synset in and_query.split(") (")
+            ]
+            for and_query in sql_query[2:-2].split(")) | ((")
+        ]
+
+    assert all(
+        " " not in syn_tag[0] or syn_tag[0].find("\"", 1, -1) == -1
+        for and_query in query_repr
+        for synset in and_query
+        for syn_tag in synset
+    )
+    return query_repr
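
A worked round trip of the two helpers (the query terms are placeholders; multi-word synonyms get quoted, synsets are OR-groups, and synsets within a clause are joined by AND):

    >>> repr_in = [[[("efficient",), ("fast",)], [("query expansion",)]]]
    >>> query_repr_to_sql_query(repr_in)
    '((efficient | fast) ("query expansion"))'
    >>> sql_query_to_query_repr('((efficient | fast) ("query expansion"))')
    [[[('efficient',), ('fast',)], [('query expansion',)]]]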
