diff --git a/.gitignore b/.gitignore
index ded31e2..d59c406 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,9 @@ config.toml
 docker-compose.yml
 .idea
 firebase.json
+dumps
+full_skills_unique.json
+filtered_vacancies.csv
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/src/dataset/dump_data.py b/src/dataset/dump_data.py
new file mode 100755
index 0000000..40cc96a
--- /dev/null
+++ b/src/dataset/dump_data.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+"""Dump the vacancy, vacancy_embedding and key_skills tables with pg_dump."""
+import subprocess
+from pathlib import Path
+
+from template_project.web_api.configuration import load_configuration
+
+
+def main() -> None:
+    """Write a data-only dump of three tables to dumps/data_dump.sql.
+
+    The database URL comes from the configuration loaded from config.toml.
+
+    Raises:
+        subprocess.CalledProcessError: If pg_dump exits with a non-zero status.
+    """
+    config_path = Path("config.toml")
+    configuration = load_configuration(config_path)
+
+    # Drop the "+psycopg" driver part so pg_dump receives a plain postgresql:// URL.
+    db_url = str(configuration.database.url.get_value())
+    db_url = db_url.replace("postgresql+psycopg://", "postgresql://")
+
+    output_dir = Path("dumps")
+    output_dir.mkdir(exist_ok=True)
+
+    output_file = output_dir / "data_dump.sql"
+
+    print("Создание дампа таблиц vacancy, vacancy_embedding, key_skills...")
+
+    # NOTE(review): the URL is passed as a command-line argument; if it carries
+    # a password, that is visible in the process list while pg_dump runs.
+    subprocess.run(
+        [
+            "pg_dump",
+            db_url,
+            "--table=vacancy",
+            "--table=vacancy_embedding",
+            "--table=key_skills",
+            "--data-only",
+            "--column-inserts",
+            f"--file={output_file}",
+        ],
+        check=True,
+    )
+
+    print(f"\nДамп создан: {output_file}")
+    print(f"Размер файла: {output_file.stat().st_size / 1024 / 1024:.2f} MB")
+    print("\nДля импорта на прод сервере выполните:")
+    print(f" psql -f {output_file}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/dataset/dump_data.sh b/src/dataset/dump_data.sh
new file mode 100755
index 0000000..409a509
--- /dev/null
+++ b/src/dataset/dump_data.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Dump the vacancy, vacancy_embedding and key_skills tables into an SQL file.
+# Usage: DATABASE_URL=postgresql://... ./dump_data.sh [output-file]
+
+# Stop on the first failure so a failed pg_dump is never reported as success.
+set -euo pipefail
+
+DB_URL="${DATABASE_URL:-postgresql://user:password@localhost:5432/dbname}"
+OUTPUT_FILE="${1:-dump_data.sql}"
+
+echo "Создание дампа таблиц vacancy, vacancy_embedding, key_skills..."
+
+pg_dump "$DB_URL" \
+    --table=vacancy \
+    --table=vacancy_embedding \
+    --table=key_skills \
+    --data-only \
+    --column-inserts \
+    --file="$OUTPUT_FILE"
+
+echo "Дамп создан: $OUTPUT_FILE"
+echo "Размер файла: $(du -h "$OUTPUT_FILE" | cut -f1)"
diff --git a/src/dataset/load_dump.sh b/src/dataset/load_dump.sh
new file mode 100755
index 0000000..c9e6006
--- /dev/null
+++ b/src/dataset/load_dump.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Load an SQL dump (by default dump_data.sql) into the target database.
+# Usage: DATABASE_URL=postgresql://... ./load_dump.sh [dump-file]
+
+set -euo pipefail
+
+DB_URL="${DATABASE_URL:-postgresql://user:password@localhost:5432/dbname}"
+DUMP_FILE="${1:-dump_data.sql}"
+
+if [ ! -f "$DUMP_FILE" ]; then
+    echo "Ошибка: файл $DUMP_FILE не найден" >&2
+    exit 1
+fi
+
+echo "Импорт дампа из $DUMP_FILE в БД..."
+
+# ON_ERROR_STOP makes psql exit non-zero on the first failed statement, and
+# --single-transaction rolls everything back instead of leaving a partial import.
+psql "$DB_URL" -v ON_ERROR_STOP=1 --single-transaction -f "$DUMP_FILE"
+
+echo "Импорт завершен!"
diff --git a/src/dataset/load_vacancies.py b/src/dataset/load_vacancies.py
index d8c3260..3f14386 100755
--- a/src/dataset/load_vacancies.py
+++ b/src/dataset/load_vacancies.py
@@ -41,7 +41,7 @@ async def main() -> None:
     ml_container = make_ml_ioc(ml_configuration)
 
     csv_path = Path("filtered_vacancies.csv")
-    max_records = 1000
+    max_records = 100_000
 
     try:
         async with backend_container() as backend_request_container, ml_container() as ml_request_container:
diff --git a/src/template_project/adapters/data_gateways/tables.py b/src/template_project/adapters/data_gateways/tables.py
index 4c8b64c..9e654f9 100644
--- a/src/template_project/adapters/data_gateways/tables.py
+++ b/src/template_project/adapters/data_gateways/tables.py
@@ -22,7 +22,7 @@ from sqlalchemy.orm import registry
 
 from template_project.application.access_token.entity import AccessToken
 from template_project.application.auth_identity.entity import AuthIdentity, AuthMethod
-from template_project.application.common.enums import EducationGrade
+from template_project.application.common.enums import EducationGrade, ExperienceType
 from template_project.application.notification_device.entity import NotificationDevice
 from template_project.application.resume.entity import (
     Resume,
diff --git a/src/template_project/adapters/data_gateways/vacancy.py b/src/template_project/adapters/data_gateways/vacancy.py index 02c30fc..d6cada6 100644 --- a/src/template_project/adapters/data_gateways/vacancy.py +++ b/src/template_project/adapters/data_gateways/vacancy.py @@ -16,11 +16,13 @@ class DefaultVacancyDataGateway(VacancyDataGateway): @override async def get_suitable(self, vector: list[float]) -> Sequence[SuitableVacancy]: + distance_expr = vacancy_embedding_table.c.vector.cosine_distance(vector) + similarity_expr = 1 - distance_expr statement = ( - select(Vacancy, label("resume_similarity", vacancy_embedding_table.c.vector.cosine_distance(vector))) + select(Vacancy, label("resume_similarity", similarity_expr)) .join(VacancyEmbedding, vacancy_embedding_table.c.vacancy_id == vacancy_table.c.id) - .where(vacancy_embedding_table.c.vector.cosine_distance(vector) > 0.5) - .order_by(vacancy_embedding_table.c.vector.cosine_distance(vector).asc()) + .where(similarity_expr >= 0.5) + .order_by(distance_expr.asc()) .limit(100) ) result = await self._session.execute(statement) diff --git a/src/template_project/adapters/generators/resume_prediction.py b/src/template_project/adapters/generators/resume_prediction.py index 9112081..f761cae 100644 --- a/src/template_project/adapters/generators/resume_prediction.py +++ b/src/template_project/adapters/generators/resume_prediction.py @@ -22,11 +22,11 @@ class DefaultResumePredictionGenerator(ResumePredictionGenerator): key_skills=resume.key_skills, suitable_vacancies=[ SuitableVacancyDs( - vacancy_id=str(suituble_vacancy.vacancy.id), - from_salary=suituble_vacancy.vacancy.from_salary, - to_salary=suituble_vacancy.vacancy.to_salary, - key_skills=suituble_vacancy.vacancy.key_skills, - resume_similarity=suituble_vacancy.resume_similarity, + vacancy_id=str(suitable_vacancy.vacancy.id), + from_salary=suitable_vacancy.vacancy.from_salary, + to_salary=suitable_vacancy.vacancy.to_salary, + 
key_skills=suitable_vacancy.vacancy.key_skills, + resume_similarity=suitable_vacancy.resume_similarity, ) for suitable_vacancy in suitable_vacancies ], diff --git a/uv.lock b/uv.lock index d8a4db3..b71ac33 100644 --- a/uv.lock +++ b/uv.lock @@ -2425,15 +2425,28 @@ backend = [ { name = "sqlalchemy" }, ] dev = [ + { name = "aioboto3" }, { name = "alembic" }, + { name = "argon2-cffi" }, { name = "bandit" }, { name = "codespell" }, { name = "coverage" }, + { name = "cryptography" }, { name = "dirty-equals" }, + { name = "firebase-admin" }, + { name = "httpx" }, { name = "mypy" }, + { name = "pgvector" }, + { name = "prometheus-fastapi-instrumentator" }, + { name = "psycopg", extra = ["binary"] }, { name = "pytest" }, { name = "pytest-asyncio" }, + { name = "python-multipart" }, { name = "ruff" }, + { name = "sentence-transformers" }, + { name = "sqlalchemy" }, + { name = "torch", version = "2.2.2", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(python_full_version < '3.13' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform == 'darwin')" }, + { name = "torch", version = "2.2.2+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(python_full_version >= '3.13' and sys_platform == 'darwin') or (python_full_version >= '3.13' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "types-cachetools" }, ] linters = [ @@ -2487,15 +2500,27 @@ backend = [ { name = "sqlalchemy", specifier = "==2.0.44" }, ] dev = [ + { name = "aioboto3", specifier = "==15.5.0" }, { name = "alembic", specifier = "==1.17.0" }, + { name = "argon2-cffi", specifier = "==23.1.0" }, { name = "bandit", specifier = "==1.8.6" }, { name = "codespell", specifier = 
"==2.4.1" }, { name = "coverage", specifier = "==7.11.0" }, + { name = "cryptography", specifier = "==46.0.3" }, { name = "dirty-equals", specifier = ">=0.11" }, + { name = "firebase-admin", specifier = ">=7.1.0" }, + { name = "httpx", specifier = "==0.28.1" }, { name = "mypy", specifier = "==1.18.1" }, + { name = "pgvector", specifier = ">=0.4.1" }, + { name = "prometheus-fastapi-instrumentator", specifier = ">=7.1.0" }, + { name = "psycopg", extras = ["binary"], specifier = ">=3.2.12" }, { name = "pytest", specifier = "==8.4.0" }, { name = "pytest-asyncio", specifier = "==1.2.0" }, + { name = "python-multipart", specifier = ">=0.0.20" }, { name = "ruff", specifier = "==0.12.11" }, + { name = "sentence-transformers", specifier = ">=5.1.2" }, + { name = "sqlalchemy", specifier = "==2.0.44" }, + { name = "torch", index = "https://download.pytorch.org/whl/cpu" }, { name = "types-cachetools", specifier = "==6.2.0.20250827" }, ] linters = [