Commit 3ffb4f6: csv and excel

11 files changed, +933 −0 lines

.gitignore (+176 lines)

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

*.tmp*

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

README.md (+111 lines)

# CSV Analyzer with Grouping

A Python utility for loading, analyzing, and grouping CSV files based on common columns. This tool is particularly useful when you need to process multiple CSV files and group their data by a specific column while preserving the original column structure.

## Features

- Load CSV files from a directory or from specific file paths
- Group data by any common column across CSV files
- Preserve original column names without modification
- Track source files in the output
- Handle matched and unmatched files separately
- Export results to organized CSV files

## Requirements

- Python 3.6+
- pandas

(`pathlib` is also used, but it is part of the standard library and needs no installation.)

## Installation

1. Clone this repository or copy the `csv_analyzer.py` file into your project
2. Install the required dependency:

```bash
pip install pandas
```

## Usage

### Basic Usage

```python
from csv_analyzer import CSVAnalyzerGrouping

# Initialize the analyzer
analyzer = CSVAnalyzerGrouping()

# Load CSV files from a directory
analyzer.load_from_directory("path/to/your/csvs")

# Or load specific CSV files
analyzer.load_from_files(["file1.csv", "file2.csv"])

# Group data by a specific column
result = analyzer.grouped_data_by_column("category")

# Export the results
output_dir = ".tmp"
analyzer.export_matched_data(output_dir, result, "grouped_by_category")
analyzer.export_unmatched_data(output_dir, result)
```

### Output Structure

The tool creates:

- A combined CSV file containing all grouped data, with the original columns plus a `source_file` column
- The `source_file` column is always the last column in the output
- Original column names are preserved without any aggregation suffixes

### Example Output Format

For input CSV files containing the columns `name,category,link,tag,label,id,x_path`, the output keeps the same structure with `source_file` appended as the last column:

```
name,category,link,tag,label,id,x_path,source_file
```
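The combining behavior described above can be sketched in plain pandas. This is a minimal illustration of the column ordering, not the tool's actual implementation; the in-memory `files` dict stands in for CSV files on disk.

```python
# Minimal sketch (assumed behavior, not csv_analyzer.py itself): combine CSVs
# that share a grouping column and append source_file as the last column.
import io

import pandas as pd

# Stand-ins for two CSV files on disk.
files = {
    "data01.csv": "name,category\na,c1\nb,c2\n",
    "data02.csv": "name,category\nc,c1\n",
}

frames = []
for name, text in files.items():
    df = pd.read_csv(io.StringIO(text))
    df["source_file"] = name  # appended last, so it stays the final column
    frames.append(df)

combined = pd.concat(frames, ignore_index=True).sort_values("category")
print(list(combined.columns))  # ['name', 'category', 'source_file']
```

Because `source_file` is assigned after each frame is read, pandas keeps it as the trailing column through `concat`, matching the documented output layout.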
## Methods

### `load_from_directory(path: str)`

Loads all CSV files from the specified directory.

### `load_from_files(files: List[str])`

Loads the CSV files at the provided file paths.

### `grouped_data_by_column(column_name: str)`

Groups data by the specified column for the files that contain it.

### `export_matched_data(output_dir: str, dataset: Dict, output_prefix: str)`

Exports matched (grouped) data to a single combined CSV file.

### `export_unmatched_data(output_dir: str, dataset: Dict)`

Exports unmatched data to separate CSV files.

## Error Handling

The tool includes error handling for:

- Invalid directory paths
- File reading errors
- Grouping operation failures
- Export errors

Each operation provides clear feedback through console messages.
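The `csv_analyzer.py` source is not shown in this excerpt. As a rough guide, a class exposing the interface documented above might look like the following sketch; the method names come from this README, while all internals (the `frames` dict, the `matched`/`unmatched` result keys, the `unmatched_` filename prefix) are assumptions for illustration.

```python
# Hypothetical sketch of a CSVAnalyzerGrouping-style class; not the project's code.
from pathlib import Path
from typing import Dict, List

import pandas as pd


class CSVAnalyzerGrouping:
    """Loads CSVs, groups those sharing a column, and exports the results."""

    def __init__(self) -> None:
        self.frames: Dict[str, pd.DataFrame] = {}  # filename -> DataFrame

    def load_from_directory(self, path: str) -> None:
        directory = Path(path)
        if not directory.is_dir():
            raise NotADirectoryError(path)
        self.load_from_files([str(p) for p in sorted(directory.glob("*.csv"))])

    def load_from_files(self, files: List[str]) -> None:
        for f in files:
            self.frames[Path(f).name] = pd.read_csv(f)

    def grouped_data_by_column(self, column_name: str) -> Dict:
        matched, unmatched = [], {}
        for name, df in self.frames.items():
            if column_name in df.columns:
                tagged = df.copy()
                tagged["source_file"] = name  # always appended as the last column
                matched.append(tagged)
            else:
                unmatched[name] = df
        combined = (
            pd.concat(matched, ignore_index=True).sort_values(column_name, ignore_index=True)
            if matched
            else pd.DataFrame()
        )
        return {"matched": combined, "unmatched": unmatched}

    def export_matched_data(self, output_dir: str, dataset: Dict, output_prefix: str) -> None:
        out = Path(output_dir)
        out.mkdir(parents=True, exist_ok=True)
        dataset["matched"].to_csv(out / f"{output_prefix}.csv", index=False)

    def export_unmatched_data(self, output_dir: str, dataset: Dict) -> None:
        out = Path(output_dir)
        out.mkdir(parents=True, exist_ok=True)
        for name, df in dataset["unmatched"].items():
            df.to_csv(out / f"unmatched_{name}", index=False)
```

Keeping the grouped frames in a filename-keyed dict makes the `source_file` tagging trivial and lets unmatched files round-trip to disk unchanged.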
## Contributing

Feel free to submit issues, fork the repository, and create pull requests for any improvements.

## License

This project is licensed under the MIT License; see the LICENSE file for details.

__tests__/testdata/data01.csv (+4 lines)

name,category,link,tag,label,id
data01-name-1,data01-category-1,data01-l-1,data01-t-1,data01-lab-1,data01-id-1
data01-name-1,data01-category-1,data01-l-1,data01-t-1,data01-lab-1,data01-id-1
data01-name-2,data01-category-2,data01-l-2,data01-t-2,data01-lab-2,data01-id-2

__tests__/testdata/data02.csv (+3 lines)

name,category,link,tag,label,id,x_path
data02-name-1,data02-category-1,data02-l-1,data02-t-1,data02-lab-1,data02-id-1,x
data02-name-2,data02-category-2,data02-l-2,data02-t-2,data02-lab-2,data02-id-2,y

__tests__/testdata/data03.csv (+3 lines)

name,category,link,tag,label,id
data03-name-1,data03-category-1,data03-l-1,data03-t-1,data03-lab-1,data03-id-1
data03-name-2,data03-category-2,data03-l-2,data03-t-2,data03-lab-2,data03-id-2

__tests__/testdata/track01.csv (+5 lines)

name,description,link,tag,label,id
track01-name-1,track01-d-1,track01-l-1,track01-t-1,track01-lab-1,track01-id-1
track01-name-2,track01-d-2,track01-l-2,track01-t-2,track01-lab-2,track01-id-2
track01-name-3,track01-d-3,track01-l-3,track01-t-3,track01-lab-3,track01-id-3
track01-name-3,track01-d-4,track01-l-4,track01-t-4,track01-lab-4,track01-id-4

__tests__/testdata/track02.csv (+3 lines)

name,description,link,tag,label,id
track02-name-1,track02-d-1,track02-l-1,track02-t-1,track02-lab-1,track02-id-1
track02-name-2,track02-d-2,track02-l-2,track02-t-2,track02-lab-2,track02-id-2
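These fixtures exercise the matched/unmatched split: the `data*.csv` files share a `category` column, while the `track*.csv` files have `description` instead. A small pandas sketch of that split (illustrative only, not the project's test code), with the fixture contents inlined so it is self-contained:

```python
# Sketch: files containing the grouping column are "matched"; others are not.
import io

import pandas as pd

data03 = pd.read_csv(io.StringIO(
    "name,category,link,tag,label,id\n"
    "data03-name-1,data03-category-1,data03-l-1,data03-t-1,data03-lab-1,data03-id-1\n"
    "data03-name-2,data03-category-2,data03-l-2,data03-t-2,data03-lab-2,data03-id-2\n"
))
track02 = pd.read_csv(io.StringIO(
    "name,description,link,tag,label,id\n"
    "track02-name-1,track02-d-1,track02-l-1,track02-t-1,track02-lab-1,track02-id-1\n"
    "track02-name-2,track02-d-2,track02-l-2,track02-t-2,track02-lab-2,track02-id-2\n"
))

frames = {"data03.csv": data03, "track02.csv": track02}
matched = [n for n, df in frames.items() if "category" in df.columns]
unmatched = [n for n, df in frames.items() if "category" not in df.columns]
print(matched, unmatched)  # ['data03.csv'] ['track02.csv']
```

Grouping by `category` would therefore combine the three `data*.csv` files and leave `track01.csv` and `track02.csv` to be exported separately as unmatched.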
