diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0bcad88..5043c87 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,6 +6,9 @@ stages: - tag - deploy +default: + retry: 2 + variables: BASE_IMAGE_NAME: $CI_REGISTRY_IMAGE TRIVY_CACHE_DIR: .cache/trivy @@ -18,6 +21,8 @@ variables: UV_CACHE_DIR: .cache/uv BUILDAH_ISOLATION: oci STORAGE_DRIVER: vfs + DOCKER_HOST: "tcp://docker:2375" + DOCKER_TLS_CERTDIR: "" cache: key: "${CI_COMMIT_REF_SLUG}" @@ -27,6 +32,13 @@ cache: - $UV_PROJECT_ENVIRONMENT policy: pull-push +.docker-job: &docker-job + image: docker:28.5 + services: + - docker:28.5-dind + before_script: + - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY + .buildah-job: &buildah-job image: quay.io/containers/buildah:latest variables: @@ -223,13 +235,13 @@ build-migrations: CONTAINERFILE: Containerfile BUILDTARGET: migrations -# build-ml: -# <<: *build-config -# when: manual -# variables: -# IMAGE_NAME: $BASE_IMAGE_NAME/ml -# CONTAINERFILE: Containerfile -# BUILDTARGET: ml +build-ml: + <<: *build-config + when: manual + variables: + IMAGE_NAME: $BASE_IMAGE_NAME/ml + CONTAINERFILE: Containerfile + BUILDTARGET: ml lint: <<: *uv-job @@ -246,19 +258,18 @@ lint: - if: $CI_COMMIT_TAG test: + <<: *docker-job stage: test tags: - - self-hosted + - beta variables: COMPOSE_PROFILES: | --profile migrations --profile tests - before_script: - - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" "$CI_REGISTRY" script: + - apk add --no-cache docker-compose - export PROFILES="$(printf '%s ' $COMPOSE_PROFILES)" - cp "$TEST_STAGE_FIREBASE_CONF" ./infrastructure/configs/backend/firebase.json - - | ( while true; do @@ -267,24 +278,23 @@ test: done ) | tee -a compose.log & - LOGS_PID=$! - - | REGISTRY_PREFIX=$CI_REGISTRY_IMAGE IMAGE_TAG=$CI_COMMIT_SHA \ docker compose -f compose.yaml -f compose.prod.yaml \ $PROFILES up -d --quiet-pull --quiet-build 2>&1 | tee compose.log - - | TEST_CONTAINER_ID=$(docker compose -f compose.yaml $PROFILES ps -q tests -a) - timeout 600 docker wait "$TEST_CONTAINER_ID" - TEST_EXIT_CODE=$(docker inspect --format "{{.State.ExitCode}}" "$TEST_CONTAINER_ID") + timeout 600 docker wait $TEST_CONTAINER_ID + TEST_EXIT_CODE=$(docker inspect --format "{{.State.ExitCode}}" $TEST_CONTAINER_ID) - if [ "$TEST_EXIT_CODE" -eq 0 ]; then + if [ $TEST_EXIT_CODE -eq 0 ]; then echo "Tests passed." else echo "Tests failed with exit code $TEST_EXIT_CODE." exit 1 fi - - docker compose -f compose.yaml $PROFILES down + - | + docker compose -f compose.yaml $PROFILES down - cat .cov/coverage.txt artifacts: paths: @@ -334,7 +344,6 @@ sast-image-migrations: # sast-image-ml: # <<: *trivy-image-scan -# when: manual # variables: # IMAGE_NAME: $BASE_IMAGE_NAME/ml # IMAGE_TYPE: ml @@ -356,11 +365,10 @@ tag-migrations: variables: IMAGE_NAME: $BASE_IMAGE_NAME/backend-migrations -# tag-ml: -# <<: *tag-config -# when: manual -# variables: -# IMAGE_NAME: $BASE_IMAGE_NAME/ml +tag-ml: + <<: *tag-config + variables: + IMAGE_NAME: $BASE_IMAGE_NAME/ml webhook-migrations-deploy: <<: *webhook-config @@ -385,6 +393,16 @@ webhook-backend-deploy: - build-runtime - sast-image-runtime +webhook-ml-deploy: + <<: *webhook-config + stage: deploy + variables: + WEBHOOK_URL: $WEBHOOK_URL_ML + resource_group: staging + dependencies: + - build-ml + # - sast-image-ml + workflow: rules: - if: $CI_PIPELINE_SOURCE == "merge_request_event" diff --git a/compose.yaml b/compose.yaml index 3b4997a..c39a6c7 100644 --- a/compose.yaml +++ b/compose.yaml @@ -64,7 +64,7 @@ services: ml: build: context: . - dockerfile: Containerfile.ml + dockerfile: Containerfile target: ml tags: - template-project-ml:latest diff --git a/src/template_project/application/resume/interactors/predict_salary.py b/src/template_project/application/resume/interactors/predict_salary.py new file mode 100644 index 0000000..dcd006c --- /dev/null +++ b/src/template_project/application/resume/interactors/predict_salary.py @@ -0,0 +1,208 @@ +from collections import defaultdict +from decimal import Decimal +from operator import itemgetter + +from Levenshtein import ratio + +from template_project.application.common.data_structure import to_data_structure +from template_project.application.common.interactor import to_interactor +from template_project.application.resume.entity import ResumeId + + +@to_data_structure +class VacancyInput: + vacancy_id: str + from_salary: Decimal + to_salary: Decimal + key_skills: list[str] + resume_similarity: float + + +@to_data_structure +class PredictSalaryRequest: + resume_id: ResumeId + key_skills: list[str] + vacancies: list[VacancyInput] + + +@to_data_structure +class PredictSalaryResponse: + salary_from: Decimal + salary_to: Decimal + recommended_skills: list[str] + + +@to_interactor +class PredictSalaryInteractor: + async def execute(self, request: PredictSalaryRequest) -> PredictSalaryResponse: + salary_from, salary_to = self._predict_salary(request.vacancies, request.key_skills) + recommended_skills = self._recommend_skills(request.vacancies, request.key_skills) + + return PredictSalaryResponse( + salary_from=salary_from, + salary_to=salary_to, + recommended_skills=recommended_skills, + ) + + def _predict_salary(self, vacancies: list[VacancyInput], resume_skills: list[str]) -> tuple[Decimal, Decimal]: + if not vacancies: + return Decimal(50000), Decimal(80000) + + vacancy_weights: list[float] = [] + for vacancy in vacancies: + skills_similarity = self._calculate_skills_similarity(resume_skills, vacancy.key_skills) + vacancy_weight = 0.8 * vacancy.resume_similarity + 0.2 * skills_similarity + vacancy_weights.append(vacancy_weight) + + total_weight = sum(vacancy_weights) + if total_weight == 0: + return Decimal(50000), Decimal(80000) + + weighted_from_sum = Decimal(0) + weighted_to_sum = Decimal(0) + + for vacancy, weight in zip(vacancies, vacancy_weights, strict=False): + weighted_from_sum += vacancy.from_salary * Decimal(str(weight)) + weighted_to_sum += vacancy.to_salary * Decimal(str(weight)) + + predicted_from = weighted_from_sum / Decimal(str(total_weight)) + predicted_to = weighted_to_sum / Decimal(str(total_weight)) + + return predicted_from.quantize(Decimal("0.01")), predicted_to.quantize(Decimal("0.01")) + + def _recommend_skills( + self, + vacancies: list[VacancyInput], + resume_skills: list[str], + ) -> list[str]: + if not vacancies: + return [] + + skill_salaries, skill_frequencies = self._collect_skill_statistics(vacancies) + + filtered_skills = self._filter_skills_by_frequency(skill_frequencies, min_frequency=3) + + candidate_skills = self._filter_skills_by_resume_similarity(filtered_skills, resume_skills) + + if not candidate_skills: + return [] + + skill_scores = self._calculate_skill_scores(candidate_skills, skill_salaries, skill_frequencies) + + return self._get_top_skills(skill_scores, top_n=3) + + def _collect_skill_statistics( + self, vacancies: list[VacancyInput] + ) -> tuple[dict[str, list[Decimal]], dict[str, int]]: + skill_salaries: dict[str, list[Decimal]] = defaultdict(list) + skill_frequencies: dict[str, int] = defaultdict(int) + + for vacancy in vacancies: + avg_salary = (vacancy.from_salary + vacancy.to_salary) / Decimal(2) + + for skill in vacancy.key_skills: + normalized_skill = skill.lower().strip() + skill_salaries[normalized_skill].append(avg_salary) + skill_frequencies[normalized_skill] += 1 + + return skill_salaries, skill_frequencies + + def _filter_skills_by_frequency( + self, + skill_frequencies: dict[str, int], + min_frequency: int = 3, + ) -> set[str]: + return {skill for skill, frequency in skill_frequencies.items() if frequency >= min_frequency} + + def _filter_skills_by_resume_similarity( + self, + skills: set[str], + resume_skills: list[str], + ) -> list[str]: + resume_skills_normalized = {skill.lower().strip() for skill in resume_skills} + + candidate_skills: list[str] = [] + for skill in skills: + is_already_in_resume = any( + self._is_skill_similar(skill, resume_skill) for resume_skill in resume_skills_normalized + ) + if not is_already_in_resume: + candidate_skills.append(skill) + + return candidate_skills + + def _calculate_skill_scores( + self, + candidate_skills: list[str], + skill_salaries: dict[str, list[Decimal]], + skill_frequencies: dict[str, int], + ) -> list[tuple[str, float]]: + skill_avg_salaries: dict[str, Decimal] = { + skill: sum(salaries) / Decimal(str(len(salaries))) + for skill, salaries in skill_salaries.items() + if skill in candidate_skills + } + + frequencies = [skill_frequencies[skill] for skill in candidate_skills] + avg_salaries = [float(skill_avg_salaries[skill]) for skill in candidate_skills] + + min_freq = min(frequencies) + max_freq = max(frequencies) + min_salary = min(avg_salaries) + max_salary = max(avg_salaries) + + skill_scores: list[tuple[str, float]] = [] + for skill in candidate_skills: + normalized_freq = self._normalize(float(skill_frequencies[skill]), min_freq, max_freq) + normalized_salary = self._normalize(float(skill_avg_salaries[skill]), min_salary, max_salary) + score = normalized_freq + normalized_salary + skill_scores.append((skill, score)) + + return skill_scores + + def _get_top_skills(self, skill_scores: list[tuple[str, float]], top_n: int = 3) -> list[str]: + skill_scores.sort(key=itemgetter(1), reverse=True) + return [skill for skill, _ in skill_scores[:top_n]] + + def _normalize(self, value: float, min_val: float, max_val: float) -> float: + if max_val == min_val: + return 0.0 + return (value - min_val) / (max_val - min_val) + + def _is_skill_similar(self, skill1: str, skill2: str, threshold: float = 0.7) -> bool: + return ratio(skill1.lower().strip(), skill2.lower().strip()) >= threshold + + def _calculate_skills_similarity(self, resume_skills: list[str], vacancy_skills: list[str]) -> float: + if not resume_skills or not vacancy_skills: + return 0.0 + + resume_skills_normalized = {skill.lower().strip() for skill in resume_skills} + vacancy_skills_normalized = {skill.lower().strip() for skill in vacancy_skills} + + matched_resume_skills = set() + matched_vacancy_skills = set() + + for resume_skill in resume_skills_normalized: + best_match_ratio = 0.0 + best_match_skill = None + + for vacancy_skill in vacancy_skills_normalized: + if vacancy_skill in matched_vacancy_skills: + continue + + similarity_ratio = ratio(resume_skill, vacancy_skill) + if similarity_ratio > best_match_ratio: + best_match_ratio = similarity_ratio + best_match_skill = vacancy_skill + + if best_match_ratio >= 0.7 and best_match_skill is not None: + matched_resume_skills.add(resume_skill) + matched_vacancy_skills.add(best_match_skill) + + intersection_size = len(matched_resume_skills) + union_size = len(resume_skills_normalized | vacancy_skills_normalized) + + if union_size == 0: + return 0.0 + + return intersection_size / union_size diff --git a/src/template_project/ml/entry_point.py b/src/template_project/ml/entry_point.py index 110ebf7..5814523 100644 --- a/src/template_project/ml/entry_point.py +++ b/src/template_project/ml/entry_point.py @@ -14,7 +14,7 @@ from fastapi.middleware.cors import CORSMiddleware from template_project.ml.configuration import load_configuration from template_project.ml.ioc.make import make_ioc -from template_project.ml.routes import embedding, healthcheck, predict +from template_project.ml.routes import embed, healthcheck, predict LOG_CONFIG: Final = { "version": 1, @@ -55,7 +55,7 @@ def make_asgi_application( allow_headers=["*"], ) app.include_router(healthcheck.router) - app.include_router(embedding.router) + app.include_router(embed.router) app.include_router(predict.router) setup_dishka(container=ioc, app=app) diff --git a/src/template_project/ml/interactors/predict_salary.py b/src/template_project/ml/interactors/predict_salary.py deleted file mode 100644 index 0775113..0000000 --- a/src/template_project/ml/interactors/predict_salary.py +++ /dev/null @@ -1,38 +0,0 @@ -from decimal import Decimal - -from template_project.application.common.data_structure import to_data_structure -from template_project.application.common.interactor import to_interactor -from template_project.application.resume.entity import ResumeId - - -@to_data_structure -class VacancyInput: - vacancy_id: str - from_salary: Decimal - to_salary: Decimal - key_skills: list[str] - resume_similarity: float - - -@to_data_structure -class PredictSalaryRequest: - resume_id: ResumeId - key_skills: list[str] - vacancies: list[VacancyInput] - - -@to_data_structure -class PredictSalaryResponse: - salary_from: Decimal - salary_to: Decimal - recommended_skills: list[str] - - -@to_interactor -class PredictSalaryInteractor: - async def execute(self, request: PredictSalaryRequest) -> PredictSalaryResponse: - return PredictSalaryResponse( - salary_from=Decimal(50000), - salary_to=Decimal(80000), - recommended_skills=["python", "django", "postgresql"], - ) diff --git a/src/template_project/ml/ioc/interactor.py b/src/template_project/ml/ioc/interactor.py index c56095d..6092b5a 100644 --- a/src/template_project/ml/ioc/interactor.py +++ b/src/template_project/ml/ioc/interactor.py @@ -1,6 +1,6 @@ from dishka import BaseScope, Provider, Scope, provide_all -from template_project.ml.interactors.predict_salary import PredictSalaryInteractor +from template_project.application.resume.interactors.predict_salary import PredictSalaryInteractor class InteractorProvider(Provider): diff --git a/src/template_project/ml/routes/embedding.py b/src/template_project/ml/routes/embed.py similarity index 100% rename from src/template_project/ml/routes/embedding.py rename to src/template_project/ml/routes/embed.py diff --git a/src/template_project/ml/routes/predict.py b/src/template_project/ml/routes/predict.py index 8db98ef..3a15592 100644 --- a/src/template_project/ml/routes/predict.py +++ b/src/template_project/ml/routes/predict.py @@ -6,7 +6,7 @@ from fastapi import APIRouter from pydantic import BaseModel, Field from template_project.application.resume.entity import ResumeId -from template_project.ml.interactors.predict_salary import ( +from template_project.application.resume.interactors.predict_salary import ( PredictSalaryInteractor, PredictSalaryRequest, VacancyInput, @@ -82,14 +82,17 @@ class PredictSalaryResponseModel(BaseModel): @router.post( - "/predict_salary", - summary="Predict salary", + "/predict", + summary="Predict salary and recommend skills", description="Predict salary range and recommend skills based on resume and relevant vacancies", responses={ - 200: {"description": "Salary prediction generated successfully", "model": PredictSalaryResponseModel}, + 200: { + "description": "Salary prediction and skills recommendation generated successfully", + "model": PredictSalaryResponseModel, + }, }, ) -async def predict_salary( +async def predict( request: PredictSalaryRequestModel, interactor: FromDishka[PredictSalaryInteractor], ) -> PredictSalaryResponseModel: diff --git a/uv.lock b/uv.lock index 1d0a05a..d8a4db3 100644 --- a/uv.lock +++ b/uv.lock @@ -1438,29 +1438,14 @@ wheels = [ [[package]] name = "numpy" -version = "2.1.2" +version = "1.26.3" source = { registry = "https://download.pytorch.org/whl/cpu" } wheels = [ - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7bf0a4f9f15b32b5ba53147369e94296f5fffb783db5aacc1be15b4bf72f43b" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b1d0fcae4f0949f215d4632be684a539859b295e2d0cb14f78ec231915d644db" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:f751ed0a2f250541e19dfca9f1eafa31a392c71c832b6bb9e113b10d050cb0f1" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:bd33f82e95ba7ad632bc57837ee99dba3d7e006536200c4e9124089e1bf42426" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b8cde4f11f0a975d1fd59373b32e2f5a562ade7cde4f85b7137f3de8fbb29a0" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d95f286b8244b3649b477ac066c6906fbb2905f8ac19b170e2175d3d799f4df" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:456e3b11cb79ac9946c822a56346ec80275eaf2950314b249b512896c0d2505e" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a84498e0d0a1174f2b3ed769b67b656aa5460c92c9554039e11f20a05650f00d" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4d6ec0d4222e8ffdab1744da2560f07856421b367928026fb540e1945f2eeeaf" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:259ec80d54999cc34cd1eb8ded513cb053c3bf4829152a2e00de2371bd406f5e" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:675c741d4739af2dc20cd6c6a5c4b7355c728167845e3c6b0e824e4e5d36a6c3" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b2d4e667895cc55e3ff2b56077e4c8a5604361fc21a042845ea3ad67465aa8" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43cca367bf94a14aca50b89e9bc2061683116cfe864e56740e083392f533ce7a" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:f2ded8d9b6f68cc26f8425eda5d3877b47343e68ca23d0d0846f4d312ecaa445" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2ffef621c14ebb0188a8633348504a35c13680d6da93ab5cb86f4e54b7e922b5" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ad369ed238b1959dfbade9018a740fb9392c5ac4f9b5173f420bd4f37ba1f7a0" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d82075752f40c0ddf57e6e02673a17f6cb0f8eb3f587f63ca1eaab5594da5b17" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:1600068c262af1ca9580a527d43dc9d959b0b1d8e56f8a05d830eea39b7c8af6" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a26ae94658d3ba3781d5e103ac07a876b3e9b29db53f68ed7df432fd033358a8" }, - { url = "https://download.pytorch.org/whl/numpy-2.1.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13311c2db4c5f7609b462bc0f43d3c465424d25c626d95040f073e30f7570e35" }, + { url = "https://download.pytorch.org/whl/numpy-1.26.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a7081fd19a6d573e1a05e600c82a1c421011db7935ed0d5c483e9dd96b99cf13" }, + { url = "https://download.pytorch.org/whl/numpy-1.26.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12c70ac274b32bc00c7f61b515126c9205323703abb99cd41836e8125ea0043e" }, + { url = "https://download.pytorch.org/whl/numpy-1.26.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f784e13e598e9594750b2ef6729bcd5a47f6cfe4a12cca13def35e06d8163e3" }, + { url = "https://download.pytorch.org/whl/numpy-1.26.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f24750ef94d56ce6e33e4019a8a4d68cfdb1ef661a52cdaee628a56d2437419" }, + { url = "https://download.pytorch.org/whl/numpy-1.26.3-cp312-cp312-win_amd64.whl", hash = "sha256:da4b0c6c699a0ad73c810736303f7fbae483bcb012e38d7eb06a5e3b432c981b" }, ] [[package]]