Skip to content

Commit e20e7c1

Browse files
authoredMar 25, 2025··
Merge pull request #1644 from qodo-ai/es/help_docs
Adding a new tool: /help_docs
2 parents 1aab875 + b161672 commit e20e7c1

17 files changed

+898
-16
lines changed
 

‎docs/docs/tools/help_docs.md

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
## Overview
2+
3+
The `help_docs` tool answers a question based on a given relative path of documentation, either from the repository of this merge request or from a given one.
4+
It can be invoked manually by commenting on any PR:
5+
```
6+
/help_docs "..."
7+
```
8+
9+
## Example usage
10+
11+
![help_docs on the documentation of this repository](https://codium.ai/images/pr_agent/help_docs_comment.png){width=512}
12+
13+
![help_docs on the documentation of another repository](https://codium.ai/images/pr_agent/help_docs_comment_explicit_git.png){width=512}
14+
15+
![help_docs response](https://codium.ai/images/pr_agent/help_docs_response.png){width=512}
16+
17+
## Configuration options
18+
19+
Under the section `pr_help_docs`, the [configuration file](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L50) contains options to customize the 'help docs' tool:
20+
21+
- `repo_url`: The repository whose documentation is queried. If left empty, the repo from which the command was invoked (issue or PR) is used; otherwise the given repo is used as context.
22+
- `repo_default_branch`: The branch to use when `repo_url` is overwritten; otherwise it has no effect.
23+
- `docs_path`: Relative path from the root of the repository (either the one this PR has been issued for, or the one given in `repo_url` above).
24+
- `exclude_root_readme`: Whether or not to exclude the root README file for querying the model.
25+
- `supported_doc_exts`: Which file extensions should be included for the purpose of querying the model.

‎docs/mkdocs.yml

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ nav:
2828
- Improve: 'tools/improve.md'
2929
- Ask: 'tools/ask.md'
3030
- Update Changelog: 'tools/update_changelog.md'
31+
- Help Docs: 'tools/help_docs.md'
3132
- Help: 'tools/help.md'
3233
- 💎 Analyze: 'tools/analyze.md'
3334
- 💎 Test: 'tools/test.md'

‎pr_agent/agent/pr_agent.py

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pr_agent.tools.pr_config import PRConfig
1414
from pr_agent.tools.pr_description import PRDescription
1515
from pr_agent.tools.pr_generate_labels import PRGenerateLabels
16+
from pr_agent.tools.pr_help_docs import PRHelpDocs
1617
from pr_agent.tools.pr_help_message import PRHelpMessage
1718
from pr_agent.tools.pr_line_questions import PR_LineQuestions
1819
from pr_agent.tools.pr_questions import PRQuestions
@@ -39,6 +40,7 @@
3940
"similar_issue": PRSimilarIssue,
4041
"add_docs": PRAddDocs,
4142
"generate_labels": PRGenerateLabels,
43+
"help_docs": PRHelpDocs,
4244
}
4345

4446
commands = list(command2class.keys())

‎pr_agent/algo/token_handler.py

+49-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from threading import Lock
22

33
from jinja2 import Environment, StrictUndefined
4+
from math import ceil
45
from tiktoken import encoding_for_model, get_encoding
56

67
from pr_agent.config_loader import get_settings
@@ -76,7 +77,35 @@ def _get_system_user_tokens(self, pr, encoder, vars: dict, system, user):
7677
get_logger().error(f"Error in _get_system_user_tokens: {e}")
7778
return 0
7879

79-
def count_tokens(self, patch: str) -> int:
80+
def calc_claude_tokens(self, patch):
    """
    Count the tokens of `patch` using Anthropic's token counting API.

    Args:
        patch: The text whose tokens should be counted.

    Returns:
        The `input_tokens` value reported by Anthropic. If the content exceeds
        the 9MB size limit, or the API call fails, the configured model's
        maximum token count is returned as a conservative estimate. If even
        that limit could not be determined (e.g. the `anthropic` package is
        missing or the model is not listed in MAX_TOKENS), the local encoder's
        count is returned instead.
    """
    # Stays None until the model limit is resolved, so that the error path
    # below never references an unbound name.
    max_tokens = None
    try:
        import anthropic
        from pr_agent.algo import MAX_TOKENS

        max_tokens = MAX_TOKENS[get_settings().config.model]

        # Check if the content size is too large (9MB limit)
        if len(patch.encode('utf-8')) > 9_000_000:
            get_logger().warning(
                "Content too large for Anthropic token counting API, "
                "returning the model's max tokens as a conservative estimate"
            )
            return max_tokens

        client = anthropic.Anthropic(api_key=get_settings(use_context=False).get('anthropic.key'))
        response = client.messages.count_tokens(
            model="claude-3-7-sonnet-20250219",
            system="system",
            messages=[{
                "role": "user",
                "content": patch
            }],
        )
        return response.input_tokens

    except Exception as e:
        get_logger().error(f"Error in Anthropic token counting: {e}")
        if max_tokens is not None:
            return max_tokens
        # The failure happened before the model limit was known: fall back to the local tokenizer.
        return len(self.encoder.encode(patch, disallowed_special=()))
107+
108+
def count_tokens(self, patch: str, force_accurate=False) -> int:
80109
"""
81110
Counts the number of tokens in a given patch string.
82111
@@ -86,4 +115,22 @@ def count_tokens(self, patch: str) -> int:
86115
Returns:
87116
The number of tokens in the patch string.
88117
"""
89-
return len(self.encoder.encode(patch, disallowed_special=()))
118+
encoder_estimate = len(self.encoder.encode(patch, disallowed_special=()))
119+
if not force_accurate:
120+
return encoder_estimate
121+
#else, need to provide an accurate estimation:
122+
123+
model = get_settings().config.model.lower()
124+
if force_accurate and 'claude' in model and get_settings(use_context=False).get('anthropic.key'):
125+
return self.calc_claude_tokens(patch) # API call to Anthropic for accurate token counting for Claude models
126+
#else: Non Anthropic provided model
127+
128+
import re
129+
model_is_from_o_series = re.match(r"^o[1-9](-mini|-preview)?$", model)
130+
if ('gpt' in get_settings().config.model.lower() or model_is_from_o_series) and get_settings(use_context=False).get('openai.key'):
131+
return encoder_estimate
132+
#else: Model is neither an OpenAI, nor an Anthropic model - therefore, cannot provide an accurate token count and instead, return a higher number as best effort.
133+
134+
elbow_factor = 1 + get_settings().get('config.model_token_count_estimate_factor', 0)
135+
get_logger().warning(f"{model}'s expected token count cannot be accurately estimated. Using {elbow_factor} of encoder output as best effort estimate")
136+
return ceil(elbow_factor * encoder_estimate)

‎pr_agent/cli.py

+3
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def set_parser():
2222
- cli.py --pr_url=... ask "write me a poem about this PR"
2323
- cli.py --pr_url=... reflect
2424
- cli.py --issue_url=... similar_issue
25+
- cli.py --pr_url/--issue_url= help_docs [<asked question>]
2526
2627
Supported commands:
2728
- review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement.
@@ -40,6 +41,8 @@ def set_parser():
4041
- add_docs
4142
4243
- generate_labels
44+
45+
- help_docs - Ask a question, from either an issue or PR context, on a given repo (current context or a different one)
4346
4447
4548
Configuration:

‎pr_agent/config_loader.py

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
"settings/pr_add_docs.toml",
2929
"settings/custom_labels.toml",
3030
"settings/pr_help_prompts.toml",
31+
"settings/pr_help_docs_prompts.toml",
3132
"settings/.secrets.toml",
3233
"settings_prod/.secrets.toml",
3334
]]

‎pr_agent/git_providers/bitbucket_provider.py

+55-3
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,15 @@ def __init__(
3030
):
3131
s = requests.Session()
3232
try:
33-
bearer = context.get("bitbucket_bearer_token", None)
33+
self.bearer_token = bearer = context.get("bitbucket_bearer_token", None)
34+
if not bearer and get_settings().get("BITBUCKET.BEARER_TOKEN", None):
35+
self.bearer_token = bearer = get_settings().get("BITBUCKET.BEARER_TOKEN", None)
3436
s.headers["Authorization"] = f"Bearer {bearer}"
3537
except Exception:
38+
self.bearer_token = get_settings().get("BITBUCKET.BEARER_TOKEN", None)
3639
s.headers[
3740
"Authorization"
38-
] = f'Bearer {get_settings().get("BITBUCKET.BEARER_TOKEN", None)}'
41+
] = f'Bearer {self.bearer_token}'
3942
s.headers["Content-Type"] = "application/json"
4043
self.headers = s.headers
4144
self.bitbucket_client = Cloud(session=s)
@@ -67,6 +70,37 @@ def get_repo_settings(self):
6770
except Exception:
6871
return ""
6972

73+
def get_git_repo_url(self, pr_url: str=None) -> str:
    """
    Build the `.git` clone url of the repository this provider's PR belongs to.

    The `pr_url` argument is ignored (bitbucket does not support issue urls);
    the url is derived from `self.pr_url`, `self.workspace_slug` and `self.repo_slug`.

    Returns:
        "<scheme>://<netloc>/<workspace>/<repo>.git", or "" if building the url failed.
    """
    try:
        pr_location = urlparse(self.pr_url)
        origin = f"{pr_location.scheme}://{pr_location.netloc}"
        return f"{origin}/{self.workspace_slug}/{self.repo_slug}.git"
    except Exception:
        get_logger().exception(f"url is not a valid merge requests url: {self.pr_url}")
        return ""
80+
81+
# Given a git repo url, return prefix and suffix of the provider in order to view a given file belonging to that repo.
82+
# Example: git clone https://bitbucket.org/codiumai/pr-agent.git and branch: main -> prefix: "https://bitbucket.org/codiumai/pr-agent/src/main", suffix: ""
83+
# In case git url is not provided, provider will use PR context (which includes branch) to determine the prefix and suffix.
84+
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
    """
    Return the (prefix, suffix) pair used to build a browsable url of a repo file.

    Args:
        repo_git_url: Optional `.git` url of the form <scheme>://<host>/<workspace>/<repo>.git.
            When omitted, the workspace, repo and branch are taken from the PR context.
        desired_branch: Branch to view; overridden by the PR branch when no git url is given.

    Returns:
        ("<scheme>://<host>/<workspace>/<repo>/src/<branch>", ""), or ("", "") when
        the given git url does not have exactly the <workspace>/<repo> form.
    """
    if not repo_git_url:
        # No explicit repo: rely on the PR this provider was initialized with.
        desired_branch = self.get_pr_branch()
        pr_location = urlparse(self.pr_url)
        origin = pr_location.scheme + "://" + pr_location.netloc
        workspace_name, project_name = self.workspace_slug, self.repo_slug
    else:
        git_location = urlparse(repo_git_url)
        origin = git_location.scheme + "://" + git_location.netloc
        repo_path = git_location.path.split('.git')[0][1:]  # /<workspace>/<repo>.git -> <workspace>/<repo>
        if repo_path.count('/') != 1:
            get_logger().error(f"repo_git_url is not a valid git repo url: {repo_git_url}")
            return ("", "")
        workspace_name, project_name = repo_path.split('/')

    # bitbucket cloud does not append a suffix
    return (f"{origin}/{workspace_name}/{project_name}/src/{desired_branch}", "")
102+
103+
70104
def publish_code_suggestions(self, code_suggestions: list) -> bool:
71105
"""
72106
Publishes code suggestions as comments on the PR.
@@ -457,7 +491,7 @@ def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool:
457491
return True
458492

459493
@staticmethod
460-
def _parse_pr_url(pr_url: str) -> Tuple[str, int]:
494+
def _parse_pr_url(pr_url: str) -> Tuple[str, int, int]:
461495
parsed_url = urlparse(pr_url)
462496

463497
if "bitbucket.org" not in parsed_url.netloc:
@@ -559,3 +593,21 @@ def publish_labels(self, pr_types: list):
559593
# bitbucket does not support labels
560594
def get_pr_labels(self, update=False):
561595
pass
596+
#Clone related
597+
def _prepare_clone_url_with_token(self, repo_url_to_clone: str) -> str | None:
598+
if "bitbucket.org" not in repo_url_to_clone:
599+
get_logger().error("Repo URL is not a valid bitbucket URL.")
600+
return None
601+
bearer_token = self.bearer_token
602+
if not bearer_token:
603+
get_logger().error("No bearer token provided. Returning None")
604+
return None
605+
606+
#For example: For repo: https://bitbucket.org/codiumai/pr-agent-tests.git
607+
#clone url will be: https://x-token-auth:<token>@bitbucket.org/codiumai/pr-agent-tests.git
608+
(scheme, base_url) = repo_url_to_clone.split("bitbucket.org")
609+
if not all([scheme, base_url]):
610+
get_logger().error(f"repo_url_to_clone: {repo_url_to_clone} is not a valid bitbucket URL.")
611+
return None
612+
clone_url = f"{scheme}x-token-auth:{bearer_token}@bitbucket.org{base_url}"
613+
return clone_url

‎pr_agent/git_providers/bitbucket_server_provider.py

+57-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
from atlassian.bitbucket import Bitbucket
99
from requests.exceptions import HTTPError
10+
import shlex
11+
import subprocess
1012

1113
from ..algo.git_patch_processing import decode_if_bytes
1214
from ..algo.language_handler import is_valid_file
@@ -34,7 +36,7 @@ def __init__(
3436
self.incremental = incremental
3537
self.diff_files = None
3638
self.bitbucket_pull_request_api_url = pr_url
37-
39+
self.bearer_token = get_settings().get("BITBUCKET_SERVER.BEARER_TOKEN", None)
3840
self.bitbucket_server_url = self._parse_bitbucket_server(url=pr_url)
3941
self.bitbucket_client = bitbucket_client or Bitbucket(url=self.bitbucket_server_url,
4042
token=get_settings().get("BITBUCKET_SERVER.BEARER_TOKEN",
@@ -47,6 +49,35 @@ def __init__(
4749
if pr_url:
4850
self.set_pr(pr_url)
4951

52+
def get_git_repo_url(self, pr_url: str=None) -> str:
    """
    Build the `.git` clone url of the repository this provider's PR belongs to.

    The `pr_url` argument is ignored (bitbucket server does not support issue urls);
    the url is derived from `self.pr_url` and the lower-cased workspace and repo slugs.

    Returns:
        "<scheme>://<netloc>/scm/<workspace>/<repo>.git", or "" if building the url failed.
    """
    try:
        pr_location = urlparse(self.pr_url)
        origin = f"{pr_location.scheme}://{pr_location.netloc}"
        return f"{origin}/scm/{self.workspace_slug.lower()}/{self.repo_slug.lower()}.git"
    except Exception:
        get_logger().exception(f"url is not a valid merge requests url: {self.pr_url}")
        return ""
59+
60+
# Given a git repo url, return prefix and suffix of the provider in order to view a given file belonging to that repo.
61+
# Example: https://bitbucket.dev.my_inc.com/scm/my_work/my_repo.git and branch: my_branch -> prefix: "https://bitbucket.dev.my_inc.com/projects/my_work/repos/my_repo/browse", suffix: "?at=refs%2Fheads%2Fmy_branch"
62+
# In case git url is not provided, provider will use PR context (which includes branch) to determine the prefix and suffix.
63+
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
    """
    Return the (prefix, suffix) pair used to build a browsable url of a repo file.

    Args:
        repo_git_url: Optional git url containing "scm/<workspace>/<repo>.git".
            When omitted, the workspace, repo and branch are taken from the PR context.
        desired_branch: Branch to view; overridden by the PR branch when no git url is given.

    Returns:
        ("<server url>/projects/<workspace>/repos/<repo>/browse", "?at=refs%2Fheads%2F<branch>"),
        or ("", "") when the workspace or the repo name could not be determined.
    """
    workspace_name, project_name = None, None
    if not repo_git_url:
        # No explicit repo: rely on the PR this provider was initialized with.
        desired_branch = self.get_pr_branch()
        workspace_name, project_name = self.workspace_slug, self.repo_slug
    elif '.git' in repo_git_url and 'scm/' in repo_git_url:
        repo_path = repo_git_url.split('.git')[0].split('scm/')[-1]
        if repo_path.count('/') == 1:  # Has to have the form <workspace>/<repo>
            workspace_name, project_name = repo_path.split('/')

    if not (workspace_name and project_name):
        get_logger().error(f"workspace_name or project_name not found in context, either git url: {repo_git_url} or uninitialized workspace/project.")
        return ("", "")

    browse_prefix = f"{self.bitbucket_server_url}/projects/{workspace_name}/repos/{project_name}/browse"
    branch_suffix = f"?at=refs%2Fheads%2F{desired_branch}"
    return (browse_prefix, branch_suffix)
80+
5081
def get_repo_settings(self):
5182
try:
5283
content = self.bitbucket_client.get_content_of_file(self.workspace_slug, self.repo_slug, ".pr_agent.toml", self.get_pr_branch())
@@ -481,3 +512,28 @@ def _get_pr_comments_path(self):
481512

482513
def _get_merge_base(self):
483514
return f"rest/api/latest/projects/{self.workspace_slug}/repos/{self.repo_slug}/pull-requests/{self.pr_num}/merge-base"
515+
# Clone related
516+
def _prepare_clone_url_with_token(self, repo_url_to_clone: str) -> str | None:
517+
if 'bitbucket.' not in repo_url_to_clone:
518+
get_logger().error("Repo URL is not a valid bitbucket URL.")
519+
return None
520+
bearer_token = self.bearer_token
521+
if not bearer_token:
522+
get_logger().error("No bearer token provided. Returning None")
523+
return None
524+
# Return unmodified URL as the token is passed via HTTP headers in _clone_inner, as seen below.
525+
return repo_url_to_clone
526+
527+
#Overriding the shell command, since for some reason usage of x-token-auth doesn't work, as mentioned here:
528+
# https://stackoverflow.com/questions/56760396/cloning-bitbucket-server-repo-with-access-tokens
529+
def _clone_inner(self, repo_url: str, dest_folder: str, operation_timeout_in_seconds: int=None):
530+
bearer_token = self.bearer_token
531+
if not bearer_token:
532+
#Shouldn't happen since this is checked in _prepare_clone, therefore - throwing an exception.
533+
raise RuntimeError(f"Bearer token is required!")
534+
535+
cli_args = shlex.split(f"git clone -c http.extraHeader='Authorization: Bearer {bearer_token}' "
536+
f"--filter=blob:none --depth 1 {repo_url} {dest_folder}")
537+
538+
subprocess.run(cli_args, check=True, # check=True will raise an exception if the command fails
539+
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=operation_timeout_in_seconds)

‎pr_agent/git_providers/git_provider.py

+73-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
from abc import ABC, abstractmethod
22
# enum EDIT_TYPE (ADDED, DELETED, MODIFIED, RENAMED)
3-
from typing import Optional
3+
import os
4+
import shutil
5+
import subprocess
6+
from typing import Optional, Tuple
47

58
from pr_agent.algo.types import FilePatchInfo
69
from pr_agent.algo.utils import Range, process_description
@@ -14,6 +17,75 @@ class GitProvider(ABC):
1417
def is_supported(self, capability: str) -> bool:
1518
pass
1619

20+
#Given a url (issues or PR/MR) - get the .git repo url to which they belong. Needs to be implemented by the provider.
21+
def get_git_repo_url(self, issues_or_pr_url: str) -> str:
    """
    Default implementation, to be overridden by the concrete provider.

    Logs a warning and returns an empty url regardless of `issues_or_pr_url`.
    """
    not_implemented_url = ""
    get_logger().warning("Not implemented! Returning empty url")
    return not_implemented_url
24+
25+
# Given a git repo url, return prefix and suffix of the provider in order to view a given file belonging to that repo. Needs to be implemented by the provider.
26+
# For example: For a git: https://git_provider.com/MY_PROJECT/MY_REPO.git and desired branch: <MY_BRANCH> then it should return ('https://git_provider.com/projects/MY_PROJECT/repos/MY_REPO/.../<MY_BRANCH>', '?=<SOME HEADER>')
27+
# so that to properly view the file: docs/readme.md -> <PREFIX>/docs/readme.md<SUFFIX> -> https://git_provider.com/projects/MY_PROJECT/repos/MY_REPO/<MY_BRANCH>/docs/readme.md?=<SOME HEADER>)
28+
def get_canonical_url_parts(self, repo_git_url:str, desired_branch:str) -> Tuple[str, str]:
    """
    Default implementation, to be overridden by the concrete provider.

    Logs a warning and returns an empty (prefix, suffix) pair regardless of the arguments.
    """
    empty_prefix, empty_suffix = "", ""
    get_logger().warning("Not implemented! Returning empty prefix and suffix")
    return (empty_prefix, empty_suffix)
31+
32+
33+
#Clone related API
34+
#An object which ensures deletion of a cloned repo, once it becomes out of scope.
35+
# Example usage:
36+
# with TemporaryDirectory() as tmp_dir:
37+
# returned_obj: GitProvider.ScopedClonedRepo = self.git_provider.clone(self.repo_url, tmp_dir, remove_dest_folder=False)
38+
# print(returned_obj.path) #Use returned_obj.path.
39+
# #From this point, returned_obj.path may be deleted at any point and therefore must not be used.
40+
class ScopedClonedRepo(object):
    """Wraps the folder of a cloned repo and removes that folder when the object is finalized."""

    def __init__(self, dest_folder):
        # Folder holding the clone; it is deleted by __del__.
        self.path = dest_folder

    def __del__(self):
        folder = self.path
        if not folder:
            return
        if os.path.exists(folder):
            # Best effort cleanup: removal errors are ignored.
            shutil.rmtree(folder, ignore_errors=True)
47+
48+
#Method to allow implementors to manipulate the repo url to clone (such as embedding tokens in the url string). Needs to be implemented by the provider.
49+
def _prepare_clone_url_with_token(self, repo_url_to_clone: str) -> str | None:
    """
    Default implementation, to be overridden by the concrete provider.

    Logs a warning and returns None regardless of `repo_url_to_clone`.
    """
    not_implemented_result = None
    get_logger().warning("Not implemented! Returning None")
    return not_implemented_result
52+
53+
# Does a shallow clone, using a forked process to support a timeout guard.
54+
# In case operation has failed, it is expected to throw an exception as this method does not return a value.
55+
def _clone_inner(self, repo_url: str, dest_folder: str, operation_timeout_in_seconds: int=None) -> None:
    """
    Shallow-clone `repo_url` into `dest_folder` by running `git clone` as a child process.

    Intended as an equivalent of Repo.clone_from(repo_url, dest_folder), but with a
    timeout guard. Nothing is returned: failure is signalled by the exception that
    subprocess.run raises (check=True is set, so a failing git command raises too).
    Note: This can only be used in context that supports using pipes.
    """
    clone_command = ["git", "clone", "--filter=blob:none", "--depth", "1", repo_url, dest_folder]
    subprocess.run(
        clone_command,
        check=True,  # raise an exception if the command fails
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        timeout=operation_timeout_in_seconds,
    )
67+
68+
CLONE_TIMEOUT_SEC = 20
69+
# Clone a given url to a destination folder. If successful, returns an object that wraps the destination folder,
70+
# deleting it once it is garbage collected. See: GitProvider.ScopedClonedRepo for more details.
71+
def clone(self, repo_url_to_clone: str, dest_folder: str, remove_dest_folder: bool = True,
          operation_timeout_in_seconds: int=CLONE_TIMEOUT_SEC) -> ScopedClonedRepo|None:
    """
    Clone a given url to a destination folder.

    Args:
        repo_url_to_clone: The url of the repository to clone.
        dest_folder: The folder to clone into.
        remove_dest_folder: If True and `dest_folder` is an existing directory, it is
            deleted before cloning.
        operation_timeout_in_seconds: Timeout handed to the underlying clone operation.

    Returns:
        A ScopedClonedRepo wrapping `dest_folder` on success, or None if the clone url
        could not be prepared or the clone failed (the failure is logged, not raised).
    """
    clone_url = self._prepare_clone_url_with_token(repo_url_to_clone)
    if not clone_url:
        get_logger().error("Clone failed: Unable to obtain url to clone.")
        return None
    try:
        if remove_dest_folder and os.path.isdir(dest_folder):
            shutil.rmtree(dest_folder)
        self._clone_inner(clone_url, dest_folder, operation_timeout_in_seconds)
        return GitProvider.ScopedClonedRepo(dest_folder)
    except Exception as e:
        # Log the url as given by the caller: the prepared clone url may have an
        # access token embedded in it, which must not end up in the logs.
        get_logger().exception(f"Clone failed: Could not clone url.",
                               artifact={"error": str(e), "url": repo_url_to_clone, "dest_folder": dest_folder})
        # No `finally: return` here: that would also swallow KeyboardInterrupt/SystemExit.
        return None
88+
1789
@abstractmethod
1890
def get_files(self) -> list:
1991
pass

‎pr_agent/git_providers/github_provider.py

+98-7
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,60 @@ def get_incremental_commits(self, incremental=IncrementalPR(False)):
6363
def is_supported(self, capability: str) -> bool:
6464
return True
6565

66+
def _get_owner_and_repo_path(self, given_url: str) -> str:
67+
try:
68+
repo_path = None
69+
if 'issues' in given_url:
70+
repo_path, _ = self._parse_issue_url(given_url)
71+
elif 'pull' in given_url:
72+
repo_path, _ = self._parse_pr_url(given_url)
73+
elif given_url.endswith('.git'):
74+
parsed_url = urlparse(given_url)
75+
repo_path = (parsed_url.path.split('.git')[0])[1:] # /<owner>/<repo>.git -> <owner>/<repo>
76+
if not repo_path:
77+
get_logger().error(f"url is neither an issues url nor a pr url nor a valid git url: {given_url}. Returning empty result.")
78+
return ""
79+
return repo_path
80+
except Exception as e:
81+
get_logger().exception(f"unable to parse url: {given_url}. Returning empty result.")
82+
return ""
83+
84+
def get_git_repo_url(self, issues_or_pr_url: str) -> str:
    """
    Given an issue or PR url, return the `.git` url of the repository it belongs to.

    Returns:
        Everything in the url up to and including the repo path, followed by ".git",
        or "" when the repo path could not be retrieved from the url.
    """
    repo_path = self._get_owner_and_repo_path(issues_or_pr_url)
    if repo_path and repo_path in issues_or_pr_url:
        url_head = issues_or_pr_url.split(repo_path)[0]
        return f"{url_head}{repo_path}.git"

    get_logger().error(f"Unable to retrieve owner/path from url: {issues_or_pr_url}")
    return ""
90+
91+
# Given a git repo url, return prefix and suffix of the provider in order to view a given file belonging to that repo.
92+
# Example: https://github.com/qodo-ai/pr-agent.git and branch: v0.8 -> prefix: "https://github.com/qodo-ai/pr-agent/blob/v0.8", suffix: ""
93+
# In case git url is not provided, provider will use PR context (which includes branch) to determine the prefix and suffix.
94+
def get_canonical_url_parts(self, repo_git_url:str, desired_branch:str) -> Tuple[str, str]:
    """
    Return the (prefix, suffix) pair used to build a browsable url of a repo file.

    Args:
        repo_git_url: Optional `.git` url of an external repository. When empty, the
            repo, base url and branch are taken from the PR context of this provider.
        desired_branch: Branch to view; overridden by the PR branch when no git url is given.

    Returns:
        ("<scheme>://<host>/<owner>/<repo>/blob/<branch>", ""), or ("", "") when the git
        url is not of the form <owner>/<repo> or when required context is missing.
    """
    owner = None
    repo = None
    scheme_and_netloc = None

    if repo_git_url:  # An external git url may differ from what this provider was initialized with, so self.repo cannot be used
        repo_path = self._get_owner_and_repo_path(repo_git_url)
        parsed_git_url = urlparse(repo_git_url)
        scheme_and_netloc = parsed_git_url.scheme + "://" + parsed_git_url.netloc
        if repo_path.count('/') != 1:  # Has to have the form <owner>/<repo>
            get_logger().error(f"Invalid repo_path: {repo_path} from repo_git_url: {repo_git_url}")
            return ("", "")
        owner, repo = repo_path.split('/')

    if (not owner or not repo) and self.repo:  # No external git url was provided, use the PR context
        owner, repo = self.repo.split('/')
        scheme_and_netloc = self.base_url_html
        desired_branch = self.get_pr_branch()

    # Every part is required to build the url: bail out if ANY of them is missing
    # (checking only that at least one exists would let a partial url through).
    if not all([scheme_and_netloc, owner, repo]):
        get_logger().error("Unable to get canonical url parts since missing context (PR or explicit git url)")
        return ("", "")

    prefix = f"{scheme_and_netloc}/{owner}/{repo}/blob/{desired_branch}"
    suffix = ""  # github does not add a suffix
    return (prefix, suffix)
119+
66120
def get_pr_url(self) -> str:
67121
return self.pr.html_url
68122

@@ -703,9 +757,9 @@ def _parse_issue_url(self, issue_url: str) -> Tuple[str, int]:
703757
return repo_name, issue_number
704758

705759
def _get_github_client(self):
706-
deployment_type = get_settings().get("GITHUB.DEPLOYMENT_TYPE", "user")
707-
708-
if deployment_type == 'app':
760+
self.deployment_type = get_settings().get("GITHUB.DEPLOYMENT_TYPE", "user")
761+
self.auth = None
762+
if self.deployment_type == 'app':
709763
try:
710764
private_key = get_settings().github.private_key
711765
app_id = get_settings().github.app_id
@@ -715,16 +769,19 @@ def _get_github_client(self):
715769
raise ValueError("GitHub app installation ID is required when using GitHub app deployment")
716770
auth = AppAuthentication(app_id=app_id, private_key=private_key,
717771
installation_id=self.installation_id)
718-
return Github(app_auth=auth, base_url=self.base_url)
719-
720-
if deployment_type == 'user':
772+
self.auth = auth
773+
elif self.deployment_type == 'user':
721774
try:
722775
token = get_settings().github.user_token
723776
except AttributeError as e:
724777
raise ValueError(
725778
"GitHub token is required when using user deployment. See: "
726779
"https://github.com/Codium-ai/pr-agent#method-2-run-from-source") from e
727-
return Github(auth=Auth.Token(token), base_url=self.base_url)
780+
self.auth = Auth.Token(token)
781+
if self.auth:
782+
return Github(auth=self.auth, base_url=self.base_url)
783+
else:
784+
raise ValueError("Could not authenticate to GitHub")
728785

729786
def _get_repo(self):
730787
if hasattr(self, 'repo_obj') and \
@@ -1064,3 +1121,37 @@ def validate_comments_inside_hunks(self, code_suggestions):
10641121
get_logger().error(f"Failed to process patch for committable comment, error: {e}")
10651122
return code_suggestions_copy
10661123

1124+
#Clone related
1125+
def _prepare_clone_url_with_token(self, repo_url_to_clone: str) -> str | None:
1126+
scheme = "https://"
1127+
1128+
#For example, to clone:
1129+
#https://github.com/Codium-ai/pr-agent-pro.git
1130+
#Need to embed inside the github token:
1131+
#https://<token>@github.com/Codium-ai/pr-agent-pro.git
1132+
1133+
github_token = self.auth.token
1134+
github_base_url = self.base_url_html
1135+
if not all([github_token, github_base_url]):
1136+
get_logger().error("Either missing auth token or missing base url")
1137+
return None
1138+
if scheme not in github_base_url:
1139+
get_logger().error(f"Base url: {github_base_url} is missing prefix: {scheme}")
1140+
return None
1141+
github_com = github_base_url.split(scheme)[1] # e.g. 'github.com' or github.<org>.com
1142+
if not github_com:
1143+
get_logger().error(f"Base url: {github_base_url} has an empty base url")
1144+
return None
1145+
if github_com not in repo_url_to_clone:
1146+
get_logger().error(f"url to clone: {repo_url_to_clone} does not contain {github_com}")
1147+
return None
1148+
repo_full_name = repo_url_to_clone.split(github_com)[-1]
1149+
if not repo_full_name:
1150+
get_logger().error(f"url to clone: {repo_url_to_clone} is malformed")
1151+
return None
1152+
1153+
clone_url = scheme
1154+
if self.deployment_type == 'app':
1155+
clone_url += "git:"
1156+
clone_url += f"{github_token}@{github_com}{repo_full_name}"
1157+
return clone_url

‎pr_agent/git_providers/gitlab_provider.py

+58
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,43 @@ def is_supported(self, capability: str) -> bool:
5757
return False
5858
return True
5959

60+
def _get_project_path_from_pr_or_issue_url(self, pr_or_issue_url: str) -> str:
61+
repo_project_path = None
62+
if 'issues' in pr_or_issue_url:
63+
#replace 'issues' with 'merge_requests', since gitlab provider does not support issue urls, just to get the git repo url:
64+
pr_or_issue_url = pr_or_issue_url.replace('issues', 'merge_requests')
65+
if 'merge_requests' in pr_or_issue_url:
66+
repo_project_path, _ = self._parse_merge_request_url(pr_or_issue_url)
67+
if not repo_project_path:
68+
get_logger().error(f"url is not a valid merge requests url: {pr_or_issue_url}")
69+
return ""
70+
return repo_project_path
71+
72+
def get_git_repo_url(self, issues_or_pr_url: str) -> str:
    """
    Given an issue or merge request url, return the `.git` url of its repository.

    Returns:
        Everything in the url up to and including the project path, followed by
        ".git", or "" when the project path could not be retrieved from the url.
    """
    repo_path = self._get_project_path_from_pr_or_issue_url(issues_or_pr_url)
    if repo_path and repo_path in issues_or_pr_url:
        url_head = issues_or_pr_url.split(repo_path)[0]
        return f"{url_head}{repo_path}.git"

    get_logger().error(f"Unable to retrieve project path from url: {issues_or_pr_url}")
    return ""
79+
80+
# Given a git repo url, return prefix and suffix of the provider in order to view a given file belonging to that repo.
81+
# Example: https://gitlab.com/codiumai/pr-agent.git and branch: t1 -> prefix: "https://gitlab.com/codiumai/pr-agent/-/blob/t1", suffix: "?ref_type=heads"
82+
# In case git url is not provided, provider will use PR context (which includes branch) to determine the prefix and suffix.
83+
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
    """
    Return the (prefix, suffix) pair used to build a browsable url of a repo file.

    Args:
        repo_git_url: Optional `.git` url of a repository. When omitted, the project
            path and branch are taken from the PR (merge request) context.
        desired_branch: Branch to view; overridden by the PR branch when no git url is given.

    Returns:
        ("<gitlab url>/<project path>/-/blob/<branch>", "?ref_type=heads"), or ("", "")
        when neither a git url nor a PR url is available.
    """
    # Imported locally so that this method does not depend on the module's import block.
    from urllib.parse import urlparse

    if not repo_git_url and not self.pr_url:
        get_logger().error("Cannot get canonical URL parts: missing either context PR URL or a repo GIT URL")
        return ("", "")

    if not repo_git_url:  # Use PR url as context
        repo_path = self._get_project_path_from_pr_or_issue_url(self.pr_url)
        desired_branch = self.get_pr_branch()
    else:  # Use repo git url
        # Take the project path from the parsed url instead of splitting on a literal
        # '.com/' or '.git': self-hosted servers are not necessarily under .com, and a
        # host name may itself contain '.git' (e.g. code.gitlab.example.org).
        repo_path = urlparse(repo_git_url).path.strip('/').removesuffix('.git')

    prefix = f"{self.gitlab_url}/{repo_path}/-/blob/{desired_branch}"
    suffix = "?ref_type=heads"  # gitlab cloud adds this suffix. gitlab server does not, but it is harmless.
    return (prefix, suffix)
96+
6097
@property
6198
def pr(self):
6299
'''The GitLab terminology is merge request (MR) instead of pull request (PR)'''
@@ -597,3 +634,24 @@ def generate_link_to_relevant_line_number(self, suggestion) -> str:
597634
get_logger().info(f"Failed adding line link, error: {e}")
598635

599636
return ""
637+
#Clone related
638+
def _prepare_clone_url_with_token(self, repo_url_to_clone: str) -> str | None:
639+
if "gitlab." not in repo_url_to_clone:
640+
get_logger().error(f"Repo URL: {repo_url_to_clone} is not a valid gitlab URL.")
641+
return None
642+
(scheme, base_url) = repo_url_to_clone.split("gitlab.")
643+
access_token = self.gl.oauth_token
644+
if not all([scheme, access_token, base_url]):
645+
get_logger().error(f"Either no access token found, or repo URL: {repo_url_to_clone} "
646+
f"is missing prefix: {scheme} and/or base URL: {base_url}.")
647+
return None
648+
649+
#Note that the ""official"" method found here:
650+
# https://docs.gitlab.com/user/profile/personal_access_tokens/#clone-repository-using-personal-access-token
651+
# requires a username, which may not be applicable.
652+
# The following solution is taken from: https://stackoverflow.com/questions/25409700/using-gitlab-token-to-clone-without-authentication/35003812#35003812
653+
# For example: For repo url: https://gitlab.codium-inc.com/qodo/autoscraper.git
654+
# Then to clone one will issue: 'git clone https://oauth2:<access token>@gitlab.codium-inc.com/qodo/autoscraper.git'
655+
656+
clone_url = f"{scheme}oauth2:{access_token}@gitlab.{base_url}"
657+
return clone_url

‎pr_agent/servers/help.py

+15
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ def get_general_commands_text():
66
"> - **/improve [--extended]**: Suggest code improvements. Extended mode provides a higher quality feedback. \n" \
77
"> - **/ask \\<QUESTION\\>**: Ask a question about the PR. \n" \
88
"> - **/update_changelog**: Update the changelog based on the PR's contents. \n" \
9+
"> - **/help_docs \\<QUESTION\\>**: Given a path to documentation (either for this repository or for a given one), ask a question. \n" \
910
"> - **/add_docs** 💎: Generate docstring for new components introduced in the PR. \n" \
1011
"> - **/generate_labels** 💎: Generate labels for the PR based on the PR's contents. \n" \
1112
"> - **/analyze** 💎: Automatically analyzes the PR, and presents changes walkthrough for each component. \n\n" \
@@ -201,3 +202,17 @@ def get_improve_usage_guide():
201202
output += f"\n\nSee the improve [usage page](https://pr-agent-docs.codium.ai/tools/improve/) for a comprehensive guide on using this tool.\n\n"
202203

203204
return output
205+
206+
207+
@staticmethod
208+
def get_help_docs_usage_guide():
209+
output = "**Overview:**\n"
210+
output += """\
211+
The help docs tool, named `help_docs`, answers a question based on a given relative path of documentation, either from the repository of this merge request or from a given one.
212+
It can be invoked manually by commenting on any PR:
213+
```
214+
/help_docs "..."
215+
```
216+
"""
217+
output += f"\n\nSee the [help_docs usage](https://pr-agent-docs.codium.ai/tools/help_docs/) page for a comprehensive guide on using this tool.\n\n"
218+
return output

‎pr_agent/settings/configuration.toml

+9
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
model="o3-mini"
1010
fallback_models=["gpt-4o-2024-11-20"]
1111
#model_weak="gpt-4o-mini-2024-07-18" # optional, a weaker model to use for some easier tasks
12+
model_token_count_estimate_factor=0.3 # factor to increase the token count estimate, in order to reduce likelihood of model failure due to too many tokens.
1213
# CLI
1314
git_provider="github"
1415
publish_output=true
@@ -212,6 +213,14 @@ num_retrieved_snippets=5
212213

213214
[pr_config] # /config #
214215

216+
[pr_help_docs]
217+
repo_url = "" # If not overridden, will use the repo from where the context came from (issue or PR)
218+
repo_default_branch = "main"
219+
docs_path = "docs"
220+
exclude_root_readme = false
221+
supported_doc_exts = [".md", ".mdx", ".rst"]
222+
enable_help_text=false
223+
215224
[github]
216225
# The type of deployment to create. Valid values are 'app' or 'user'.
217226
deployment_type = "user"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
[pr_help_docs_prompts]
2+
system="""You are Doc-helper, a language model designed to answer questions about a documentation website for a given repository.
3+
You will receive a question, a repository url and the full documentation content for that repository (either as markdown or as restructured text).
4+
Your goal is to provide the best answer to the question using the documentation provided.
5+
6+
Additional instructions:
7+
- Be short and concise in your answers. Give examples if needed.
8+
- Answer only questions that are related to the documentation website content. If the question is completely unrelated to the documentation, return an empty response.
9+
10+
11+
The output must be a YAML object equivalent to type $DocHelper, according to the following Pydantic definitions:
12+
=====
13+
class relevant_section(BaseModel):
14+
file_name: str = Field(description="The name of the relevant file")
15+
relevant_section_header_string: str = Field(description="The exact text of the relevant markdown/restructured text section heading from the relevant file (starting with '#', '##', etc.). Return empty string if the entire file is the relevant section, or if the relevant section has no heading")
16+
17+
class DocHelper(BaseModel):
18+
user_question: str = Field(description="The user's question")
19+
response: str = Field(description="The response to the user's question")
20+
relevant_sections: List[relevant_section] = Field(description="A list of the relevant markdown/restructured text sections in the documentation that answer the user's question, ordered by importance (most relevant first)")
21+
question_is_relevant: int = Field(description="Return 1 if the question is somewhat relevant to documentation. 0 - otherwise")
22+
=====
23+
24+
25+
Example output:
26+
```yaml
27+
user_question: |
28+
...
29+
response: |
30+
...
31+
relevant_sections:
32+
- file_name: "src/file1.py"
33+
relevant_section_header_string: |
34+
...
35+
- ...
36+
question_is_relevant: |
37+
1
38+
"""
39+
40+
user="""\
41+
Documentation url: '{{ docs_url| trim }}'
42+
-----
43+
44+
45+
User's Question:
46+
=====
47+
{{ question|trim }}
48+
=====
49+
50+
51+
Documentation website content:
52+
=====
53+
{{ snippets|trim }}
54+
=====
55+
56+
57+
Reminder: The output must be a YAML object equivalent to type $DocHelper, similar to the following example output:
58+
=====
59+
Example output:
60+
```yaml
61+
user_question: |
62+
...
63+
response: |
64+
...
65+
relevant_sections:
66+
- file_name: "src/file1.py"
67+
relevant_section_header_string: |
68+
...
69+
- ...
70+
question_is_relevant: |
71+
1
72+
=====
73+
74+
75+
Response (should be a valid YAML, and nothing else).
76+
```yaml
77+
"""

‎pr_agent/tools/pr_help_docs.py

+369
Large diffs are not rendered by default.

‎pr_agent/tools/pr_help_message.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ def __init__(self, pr_url: str, args=None, ai_handler: partial[BaseAiHandler,] =
3535
self.ai_handler = ai_handler()
3636
self.question_str = self.parse_args(args)
3737
self.return_as_string = return_as_string
38-
self.num_retrieved_snippets = get_settings().get('pr_help.num_retrieved_snippets', 5)
3938
if self.question_str:
4039
self.vars = {
4140
"question": self.question_str,
@@ -209,6 +208,7 @@ async def run(self):
209208
tool_names.append(f"[REVIEW]({base_path}/review/)")
210209
tool_names.append(f"[IMPROVE]({base_path}/improve/)")
211210
tool_names.append(f"[UPDATE CHANGELOG]({base_path}/update_changelog/)")
211+
tool_names.append(f"[HELP DOCS]({base_path}/help_docs/)")
212212
tool_names.append(f"[ADD DOCS]({base_path}/documentation/) 💎")
213213
tool_names.append(f"[TEST]({base_path}/test/) 💎")
214214
tool_names.append(f"[IMPROVE COMPONENT]({base_path}/improve_component/) 💎")
@@ -224,6 +224,7 @@ async def run(self):
224224
descriptions.append("Adjustable feedback about the PR, possible issues, security concerns, review effort and more")
225225
descriptions.append("Code suggestions for improving the PR")
226226
descriptions.append("Automatically updates the changelog")
227+
descriptions.append("Answers a question regarding this repository, or a given one, based on a given documentation path")
227228
descriptions.append("Generates documentation to methods/functions/classes that changed in the PR")
228229
descriptions.append("Generates unit tests for a specific component, based on the PR code change")
229230
descriptions.append("Code suggestions for a specific component that changed in the PR")
@@ -240,6 +241,7 @@ async def run(self):
240241
commands.append("`/review`")
241242
commands.append("`/improve`")
242243
commands.append("`/update_changelog`")
244+
commands.append("`/help_docs`")
243245
commands.append("`/add_docs`")
244246
commands.append("`/test`")
245247
commands.append("`/improve_component`")
@@ -255,6 +257,7 @@ async def run(self):
255257
checkbox_list.append(" - [ ] Run <!-- /review -->")
256258
checkbox_list.append(" - [ ] Run <!-- /improve -->")
257259
checkbox_list.append(" - [ ] Run <!-- /update_changelog -->")
260+
checkbox_list.append(" - [ ] Run <!-- /help_docs -->")
258261
checkbox_list.append(" - [ ] Run <!-- /add_docs -->")
259262
checkbox_list.append(" - [ ] Run <!-- /test -->")
260263
checkbox_list.append(" - [ ] Run <!-- /improve_component -->")

‎requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
aiohttp==3.9.5
2-
anthropic[vertex]==0.47.1
2+
anthropic>=0.48
3+
#anthropic[vertex]==0.47.1
34
atlassian-python-api==3.41.4
45
azure-devops==7.1.0b3
56
azure-identity==1.15.0

0 commit comments

Comments
 (0)
Please sign in to comment.