rfdiazpr
diff --git a/‎AUTHORS
+1 b/‎AUTHORS
+1
diff --git a/‎CHANGELOG
+7 b/‎CHANGELOG
+7
diff --git a/‎DEPENDENCIES
+4 b/‎DEPENDENCIES
+4
diff --git a/‎README
+36 b/‎README
+36
diff --git a/‎doc/Makefile
+88 b/‎doc/Makefile
+88
diff --git a/‎doc/source/TODO.rst
+20 b/‎doc/source/TODO.rst
+20
diff --git a/‎doc/source/algorithm.rst
+158 b/‎doc/source/algorithm.rst
+158
diff --git a/‎doc/source/api.rst
+68 b/‎doc/source/api.rst
+68
@@ -0,0 +1 @@
+Oscar Celma (ocelma __at__ gmail __dot__ com), http://ocelma.net
@@ -0,0 +1,7 @@
+===========
+Version 0.1
+===========
+
+2011-10-08
+
+    * Added the whole project at github
@@ -0,0 +1,4 @@
+divisi2
+csc-pysparse
+numpy
+scipy
@@ -0,0 +1,36 @@
+=============
+python-recsys
+=============
+
+A python library for implementing a recommender system.
+
+==================
+INSTALLATION NOTES
+==================
+
+1) Dependencies
+
+pyrecsys is built on top of Divisi2, with csc-pysparse (Divisi2 also requires NumPy).
+pyrecsys also requires SciPy.
+
+To install the dependencies do something like this (Ubuntu):
+
+    sudo apt-get install python-scipy
+    sudo apt-get install python-numpy
+    sudo pip install divisi2 csc-pysparse
+
+    # If you don't have pip installed then do:
+    # sudo easy_install csc-pysparse
+    # sudo easy_install divisi2
+
+2) Download
+
+Download pyrecsys from github: https://github.com/ocelma/python-recsys
+
+3) Install
+
+    tar xvfz pyrecsys.tar.gz
+    cd pyrecsys
+    sudo python setup.py install
+
+..and you're all set! (hopefully)
@@ -0,0 +1,88 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+# SPHINXOPTS: extra options; folded into ALLSPHINXOPTS below.
+SPHINXOPTS    =
+# SPHINXBUILD: the command every builder rule runs.
+SPHINXBUILD   = sphinx-build
+# PAPER: empty by default; a4 or letter selects the matching PAPEROPT_* value.
+PAPER         =
+
+# Internal variables.
+# Paper-size settings, looked up by name through $(PAPEROPT_$(PAPER)).
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+# Arguments shared by every builder rule. When PAPER is empty the lookup
+# becomes $(PAPEROPT_), which is not defined here and expands to nothing.
+ALLSPHINXOPTS   = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+# Every rule in this Makefile is listed as phony, so make always runs its
+# recipe instead of looking for a file with the same name.
+.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
+
+# Print a one-line summary of each builder target. As the first rule in the
+# file, this is what a bare `make` runs. Every line is silenced with @ so
+# only the message text is shown.
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html      to make standalone HTML files"
+	@echo "  dirhtml   to make HTML files named index.html in directories"
+	@echo "  pickle    to make pickle files"
+	@echo "  json      to make JSON files"
+	@echo "  htmlhelp  to make HTML files and a HTML help project"
+	@echo "  qthelp    to make HTML files and a qthelp project"
+	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  changes   to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck to check all external links for integrity"
+	@echo "  doctest   to run all doctests embedded in the documentation (if enabled)"
+
+# Remove everything under build/ (the directory itself is kept).
+# The leading '-' tells make to carry on even if rm reports an error.
+clean:
+	-rm -rf build/*
+
+# Standalone HTML output. The target name is also the Sphinx builder name
+# and the output sub-directory, so $@ supplies both.
+html:
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) build/$@
+	@echo
+	@echo "Build finished. The HTML pages are in build/html."
+
+# HTML output with one index.html per directory; $@ expands to "dirhtml".
+dirhtml:
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) build/$@
+	@echo
+	@echo "Build finished. The HTML pages are in build/dirhtml."
+
+# Run the "pickle" builder into build/pickle; $@ expands to the target name.
+pickle:
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) build/$@
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+# Run the "json" builder into build/json; $@ expands to the target name.
+json:
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) build/$@
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+# Run the "htmlhelp" builder into build/htmlhelp. The closing message is
+# passed to echo as two arguments, which echo joins with a single space.
+htmlhelp:
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) build/$@
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" ".hhp project file in build/htmlhelp."
+
+# Run the "qthelp" builder into build/qthelp, then print the follow-up
+# commands for generating and viewing the Qt help collection.
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) build/qthelp
+	@echo
+# The quotes around qcollectiongenerator are escaped: unescaped, they would
+# close and reopen the shell string and vanish from the printed message.
+	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
+	      ".qhcp project file in build/qthelp, like this:"
+	@echo "# qcollectiongenerator build/qthelp/Recommender.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile build/qthelp/Recommender.qhc"
+
+# Run the "latex" builder into build/latex; paper size comes from PAPER via
+# ALLSPHINXOPTS. The last message is two echo arguments joined by one space.
+latex:
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) build/$@
+	@echo
+	@echo "Build finished; the LaTeX files are in build/latex."
+	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" "run these through (pdf)latex."
+
+# Run the "changes" builder into build/changes; $@ expands to the target name.
+changes:
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) build/$@
+	@echo
+	@echo "The overview file is in build/changes."
+
+# Run the "linkcheck" builder into build/linkcheck. The first echo argument
+# keeps its trailing space, so the printed message is unchanged.
+linkcheck:
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) build/$@
+	@echo
+	@echo "Link check complete; look for any errors in the above output " "or in build/linkcheck/output.txt."
+
+# Run the "doctest" builder into build/doctest. The first echo argument
+# keeps its trailing space, so the printed message is unchanged.
+doctest:
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) build/$@
+	@echo "Testing of doctests in the sources finished, look at the " "results in build/doctest/output.txt."
@@ -0,0 +1,20 @@
+TODO
+=====
+
+* divisi2.make_sparse is way too slow! Need to change the way the matrix is created
+
+* Too much memory consumption using divisi2 functions. Remove dependencies from divisi2?
+
+* algorithms: Gradient Descent rsvd?
+
+* evaluation: add DCG
+
+* Mention other Python approaches:
+
+  * pysuggest http://code.google.com/p/pysuggest/
+
+  * pyrsvd http://code.google.com/p/pyrsvd/
+
+  * crab https://github.com/marcelcaraciolo/crab
+
+  * pyflix http://pyflix.python-hosting.com/
@@ -0,0 +1,158 @@
+Algorithms
+==========
+
+**pyrecsys** provides, *out of the box*, some basic algorithms based on matrix factorization.
+
+SVD
+---
+
+**pyrecsys** makes use of `SVD`_ in order to decompose the input data (a matrix).
+Once the matrix is *reduced* into a lower dimensional space, **pyrecsys** can provide
+predictions, recommendations and similarity among the "elements" (being either users or
+items -it's just a matter of how you load the matrix data-).
+
+.. _`SVD`: http://en.wikipedia.org/wiki/Singular_value_decomposition
+
+Loading data
+~~~~~~~~~~~~
+
+.. code-block:: python
+
+    from recsys.algorithm.factorize import SVD
+
+    filename = './data/movielens/ratings.dat'
+    svd = SVD()
+    svd.load_data(filename=filename, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
+
+.. code-block:: python
+
+    from recsys.datamodel.data import Data
+    from recsys.algorithm.factorize import SVD
+    
+    filename = './data/movielens/ratings.dat'
+    data = Data()
+    format = {'col':0, 'row':1, 'value':2, 'ids': int}
+    data.load(filename, sep='::', format=format)
+    train, test = data.split_train_test(percent=80) # 80% train, 20% test
+
+    svd = SVD()
+    svd.set_data(train)
+
+Computing
+~~~~~~~~~
+
+    >>> K=100
+    >>> svd.compute(k=K, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile=None)
+
+Parameters:
+
+    *min_values*: remove those rows or columns (from the input matrix) that have fewer than 'min_values' non-zeros
+
+    *pre_normalize*: normalize input matrix. Possible values are *tfidf*, *rows*, *cols*, *all*.
+
+        **tfidf**: By default, treats the matrix as terms-by-documents. It's important, then, how the data is loaded. Use the *format* param in *svd.load_data()* to determine the order of the fields of the input file.
+
+        **rows**: Rescales the rows of the input matrix so that they all have unit Euclidean magnitude
+
+        **cols**: Rescales the columns of the input matrix so that they all have unit Euclidean magnitude
+
+        **all**: Rescales the rows and columns of the input matrix, by dividing both the rows and the columns by the square root of their Euclidean norm
+
+    *mean_center*: centering the input matrix (aka mean subtraction)
+
+    *post_normalize*: Normalize every row of :math:`U \Sigma` to be a unit vector. Thus, row similarity (using cosine distance) returns :math:`[-1.0 .. 1.0]`
+
+    *savefile*: Output file to store SVD transformation (:math:`U, \Sigma, V^T` vectors)
+
+Predictions
+~~~~~~~~~~~~~~~
+
+To predict a *rating*, :math:`\hat{r}_{ui}`, SVD class reconstructs the original matrix, :math:`M^\prime = U \Sigma_k V^T`
+
+Then, 
+
+    >>> svd.predict(ITEMID, USERID, MIN_RATING=0.0, MAX_RATING=5.0)
+
+is equivalent to:
+
+.. math:: 
+
+    \hat{r}_{ui} = M^\prime_{ij}
+
+Recommendations
+~~~~~~~~~~~~~~~
+
+Recommendations (i.e. unknown values in :math:`M_{ij}`) are also derived from :math:`M^\prime = U \Sigma_k V^T`. In this case, 
+
+    >>> svd.recommend(USERID, n=10, only_unknowns=True, is_row=False)
+
+returns the highest values of :math:`M^\prime_{i \cdot}` :math:`\forall_j{M_{ij}=\emptyset}`, whilst
+
+    >>> svd.recommend(USERID, n=10, only_unknowns=False, is_row=False)
+
+returns the highest values for the user
+
+Neighbourhood SVD
+-----------------
+
+Classic Neighbourhood algorithm uses the ratings of the similar users (or
+items) to predict the values of the input matrix *M*.
+
+.. code-block:: python
+
+    import sys
+    from recsys.algorithm.factorize import SVDNeighbourhood
+
+    svd = SVDNeighbourhood()
+    svd.load_data(filename=sys.argv[1], sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
+    K=100
+    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)
+
+Predictions
+~~~~~~~~~~~
+
+The only difference with *plain* SVD is the way it computes the predictions :math:`\hat{r}_{ui}`
+
+    >>> svd.predict(ITEMID, USERID, weighted=True, MIN_VALUE=0.0, MAX_VALUE=5.0)
+
+To compute the prediction, it uses this equation (u=USERID, i=ITEMID):
+
+.. math:: 
+
+    \hat{r}_{ui} = \frac{\sum_{j \in S^{k}(i;u)} s_{ij} r_{uj}}{\sum_{j \in S^{k}(i;u)} s_{ij}}
+
+where
+
+:math:`S^k(i; u)` denotes the set of :math:`k` items rated by :math:`u`, which are most similar to :math:`i`. 
+
+* To compute the :math:`k` items most similar to :math:`i`, it uses the *svd.similar(i)* method. Then it gets those items that user :math:`u` has already rated
+
+:math:`s_{ij}` is the similarity between :math:`i` and :math:`j`, computed using *svd.similarity(i, j)*
+
+Comparison
+----------
+
+For those who love RMSE, MAE and the like, here are some numbers comparing both SVD approaches.
+The evaluation uses the `Movielens`_ 1M ratings dataset, splitting the train/test dataset with ~80%-20%.
+
+.. _`Movielens`: http://www.grouplens.org/node/73
+
+.. note::
+
+    Computing svd k=100, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True
+
+.. warning::
+
+    Because of *min_values=5*, some rows (movies) or columns (users) in the input matrix are removed. In fact, those movies that were rated by fewer than 5 users, and those users that rated fewer than 5 movies, are removed.
+
+Results
+~~~~~~~
+
+# Ratings in the Test dataset: 209,908
+
++-----------+--------+----------------+
+|           | **SVD**| **SVD Neigh.** | 
++-----------+--------+----------------+
+| **RMSE**  | 0.91811| 0.875496       |
++-----------+--------+----------------+
+| **MAE**   | 0.71703| 0.684173       |
++-----------+--------+----------------+
@@ -0,0 +1,68 @@
+===
+API
+===
+
+.. automodule:: recsys
+
+Algorithms
+==========
+
+See some usage examples `here <algorithm.html>`_
+
+Baseclass
+---------
+
+.. autoclass:: recsys.algorithm.baseclass.Algorithm
+    :members: 
+
+SVD
+---
+
+.. autoclass:: recsys.algorithm.factorize.SVD
+    :members: 
+
+SVD Neighbourhood
+-----------------
+
+.. autoclass:: recsys.algorithm.factorize.SVDNeighbourhood
+    :members: 
+
+.. SVD Neighbourhood Koren
+.. -----------------------
+
+.. .. autoclass:: recsys.algorithm.factorize.SVDNeighbourhoodKoren
+..     :members: 
+
+Evaluation
+==========
+
+See some `examples <evaluation.html>`_
+
+.. autoclass:: recsys.evaluation.baseclass.Evaluation
+    :members: 
+
+Data Model
+==========
+
+**pyrecsys** data model includes: users, items, and their interactions.
+See some `datamodel examples <datamodel.html>`_
+
+Item
+----
+
+.. autoclass:: recsys.datamodel.item.Item
+    :members: 
+
+User
+----
+
+.. autoclass:: recsys.datamodel.user.User
+    :members: 
+
+Data
+----
+
+.. autoclass:: recsys.datamodel.data.Data
+    :members: 
+
+
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Oscar Celma (ocelma __at__ gmail __dot__ com), http://ocelma.net`
-Original file line number
+Diff line change
 +divisi2
 +csc-pysparse
 +numpy
 +scipy