Merge pull request #59 from IBM/0.3.0 · codellm-devkit/python-sdk@b5c663b · GitHub
[go: up one dir, main page]

Skip to content

Commit b5c663b

Browse files
authored
Merge pull request #59 from IBM/0.3.0
Reads slim JSON from codeanalyzer v2.0.0.
2 parents b3d9fb7 + acc9b42 commit b5c663b

File tree

12 files changed

+173412
-250
lines changed

12 files changed

+173412
-250
lines changed

cldk/analysis/analysis_level.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
class AnalysisLevel(str, Enum):
2525
"""Analysis levels"""
2626

27-
symbol_table = "symbol-table"
28-
call_graph = "call-graph"
29-
program_dependency_graph = "program-dependency-graph"
30-
system_dependency_graph = "system-dependency-graph"
27+
symbol_table = "symbol table"
28+
call_graph = "call graph"
29+
program_dependency_graph = "program dependency graph"
30+
system_dependency_graph = "system dependency graph"

cldk/analysis/java/codeanalyzer/codeanalyzer.py

Lines changed: 10 additions & 64 deletions
170-
logger.info(f"Codeanalzyer jar is already at the latest version.")
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,6 @@
1414
# limitations under the License.
1515
################################################################################
1616

17-
"""
18-
Codeanalyzer module
19-
"""
20-
2117
import re
2218
import json
2319
import shlex
@@ -120,61 +116,6 @@ def __init__(
120116
else:
121117
self.call_graph: DiGraph | None = None
122118

123-
@staticmethod
124-
def _download_or_update_code_analyzer(filepath: Path) -> str:
125-
"""Downloads the codeanalyzer jar from the latest release on GitHub.
126-
127-
Args:
128-
filepath (Path): The path to save the codeanalyzer jar.
129-
130-
Returns:
131-
str: The path to the downloaded codeanalyzer jar file.
132-
"""
133-
url = "https://api.github.com/repos/IBM/codenet-minerva-code-analyzer/releases/latest"
134-
response = requests.get(url)
135-
date_format = "%Y%m%dT%H%M%S"
136-
if response.status_code == 200:
137-
for asset in response.json().get("assets", []):
138-
if asset["name"] == "codeanalyzer.jar":
139-
download_url = asset["browser_download_url"]
140-
pattern = r"(\d{8}T\d{6})"
141-
match = re.search(pattern, download_url)
142-
if match:
143-
datetime_str = match.group(0)
144-
else:
145-
raise Exception(f"Release URL {download_url} does not contain a datetime pattern.")
146-
147-
# Look for codeanalyzer.YYYYMMDDTHHMMSS.jar in the filepath
148-
current_codeanalyzer_jars = [jarfile for jarfile in filepath.glob("*.jar")]
149-
if not any(current_codeanalyzer_jars):
150-
logger.info(f"Codeanalzyer jar is not found. Downloading the latest version.")
151-
filename = filepath / f"codeanalyzer.{datetime_str}.jar"
152-
urlretrieve(download_url, filename)
153-
return filename.__str__()
154-
155-
current_codeanalyzer_jar_name = current_codeanalyzer_jars[0]
156-
match = re.search(pattern, current_codeanalyzer_jar_name.__str__())
157-
if match:
158-
current_datetime_str = match.group(0)
159-
160-
if datetime.strptime(datetime_str, date_format) > datetime.strptime(current_datetime_str, date_format):
161-
logger.info(f"Codeanalzyer jar is outdated. Downloading the latest version.")
162-
# Remove the older codeanalyzer jar
163-
for jarfile in current_codeanalyzer_jars:
164-
jarfile.unlink()
165-
# Download the newer codeanalyzer jar
166-
filename = filepath / f"codeanalyzer.{datetime_str}.jar"
167-
urlretrieve(download_url, filename)
168-
else:
169-
filename = current_codeanalyzer_jar_name
170
171-
else:
172-
filename = current_codeanalyzer_jar_name
173-
174-
return filename.__str__()
175-
else:
176-
raise Exception(f"Failed to fetch release warn: {response.status_code} {response.text}")
177-
178119
def _get_application(self) -> JApplication:
179120
"""Returns the application view of the Java code.
180121
@@ -204,14 +145,15 @@ def _get_codeanalyzer_exec(self) -> List[str]:
204145

205146
if self.analysis_backend_path:
206147
analysis_backend_path = Path(self.analysis_backend_path)
207-
logger.info(f"Using codeanalyzer.jar from {analysis_backend_path}")
208-
codeanalyzer_exec = shlex.split(f"java -jar {analysis_backend_path / 'codeanalyzer.jar'}")
148+
logger.info(f"Using codeanalyzer jar from {analysis_backend_path}")
149+
codeanalyzer_jar_file = next(analysis_backend_path.rglob("codeanalyzer-*.jar"), None)
150+
codeanalyzer_exec = shlex.split(f"java -jar {codeanalyzer_jar_file}")
209151
else:
210152
# Since the path to codeanalyzer.jar was not provided, we'll download the latest version from GitHub.
211153
with resources.as_file(resources.files("cldk.analysis.java.codeanalyzer.jar")) as codeanalyzer_jar_path:
212154
# Download the codeanalyzer jar if it doesn't exist, update if it's outdated,
213155
# do nothing if it's up-to-date.
214-
codeanalyzer_jar_file = self._download_or_update_code_analyzer(codeanalyzer_jar_path)
156+
codeanalyzer_jar_file = next(codeanalyzer_jar_path.rglob("codeanalyzer-*.jar"), None)
215157
codeanalyzer_exec = shlex.split(f"java -jar {codeanalyzer_jar_file}")
216158
return codeanalyzer_exec
217159

@@ -372,11 +314,15 @@ def _generate_call_graph(self, using_symbol_table) -> DiGraph:
372314
{
373315
"type": jge.type,
374316
"weight": jge.weight,
375-
"calling_lines": tsu.get_calling_lines(jge.source.method.code, jge.target.method.signature),
317+
"calling_lines": (
318+
tsu.get_calling_lines(jge.source.method.code, jge.target.method.signature, jge.target.method.is_constructor)
319+
if not jge.source.method.is_implicit or not jge.target.method.is_implicit
320+
else []
321+
),
376322
},
377323
)
378324
for jge in sdg
379-
if jge.type == "CONTROL_DEP" or jge.type == "CALL_DEP"
325+
if jge.type == "CALL_DEP" # or jge.type == "CONTROL_DEP"
380326
]
381327
for jge in sdg:
382328
cg.add_node(

cldk/analysis/java/treesitter/javasitter.py

Lines changed: 43 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
"""
1818
JavaSitter module
1919
"""
20-
2120
from itertools import groupby
2221
from typing import List, Set, Dict
2322
from tree_sitter import Language, Node, Parser, Query, Tree
@@ -26,6 +25,10 @@
2625

2726
from cldk.models.treesitter import Captures
2827

28+
import logging
29+
30+
logger = logging.getLogger(__name__)
31+
2932

3033
class JavaSitter:
3134
"""
@@ -51,8 +54,7 @@ def method_is_not_in_class(self, method_name: str, class_body: str) -> bool:
5154
bool
5255
True if the method is in the class, False otherwise.
5356
"""
54-
methods_in_class = self.frame_query_and_capture_output("(method_declaration name: (identifier) @name)",
55-
class_body)
57+
methods_in_class = self.frame_query_and_capture_output("(method_declaration name: (identifier) @name)", class_body)
5658

5759
return method_name not in {method.node.text.decode() for method in methods_in_class}
5860

@@ -103,8 +105,7 @@ def get_all_imports(self, source_code: str) -> Set[str]:
103105
Returns:
104106
Set[str]: A set of all the imports in the class.
105107
"""
106-
import_declerations: Captures = self.frame_query_and_capture_output(
107-
query="(import_declaration (scoped_identifier) @name)", code_to_process=source_code)
108+
import_declerations: Captures = self.frame_query_and_capture_output(query="(import_declaration (scoped_identifier) @name)", code_to_process=source_code)
108109
return {capture.node.text.decode() for capture in import_declerations}
109110

110111
def get_pacakge_name(self, source_code: str) -> str:
@@ -116,8 +117,7 @@ def get_pacakge_name(self, source_code: str) -> str:
116117
Returns:
117118
str: The package name.
118119
"""
119-
package_name: Captures = self.frame_query_and_capture_output(query="((package_declaration) @name)",
120-
code_to_process=source_code)
120+
package_name: Captures = self.frame_query_and_capture_output(query="((package_declaration) @name)", code_to_process=source_code)
121121
if package_name:
122122
return package_name[0].node.text.decode().replace("package ", "").replace(";", "")
123123
return None
@@ -143,8 +143,7 @@ def get_superclass(self, source_code: str) -> str:
143143
Returns:
144144
Set[str]: A set of all the superclasses in the class.
145145
"""
146-
superclass: Captures = self.frame_query_and_capture_output(
147-
query="(class_declaration (superclass (type_identifier) @superclass))", code_to_process=source_code)
146+
superclass: Captures = self.frame_query_and_capture_output(query="(class_declaration (superclass (type_identifier) @superclass))", code_to_process=source_code)
148147

149148
if len(superclass) == 0:
150149
return ""
@@ -161,9 +160,7 @@ def get_all_interfaces(self, source_code: str) -> Set[str]:
161160
Set[str]: A set of all the interfaces implemented by the class.
162161
"""
163162

164-
interfaces = self.frame_query_and_capture_output(
165-
"(class_declaration (super_interfaces (type_list (type_identifier) @interface)))",
166-
code_to_process=source_code)
163+
interfaces = self.frame_query_and_capture_output("(class_declaration (super_interfaces (type_list (type_identifier) @interface)))", code_to_process=source_code)
167164
return {interface.node.text.decode() for interface in interfaces}
168165

169166
def frame_query_and_capture_output(self, query: str, code_to_process: str) -> Captures:
@@ -182,8 +179,7 @@ def frame_query_and_capture_output(self, query: str, code_to_process: str) -> Ca
182179

183180
def get_method_name_from_declaration(self, method_name_string: str) -> str:
184181
"""Get the method name from the method signature."""
185-
captures: Captures = self.frame_query_and_capture_output("(method_declaration name: (identifier) @method_name)",
186-
method_name_string)
182+
captures: Captures = self.frame_query_and_capture_output("(method_declaration name: (identifier) @method_name)", method_name_string)
187183

188184
return captures[0].node.text.decode()
189185

@@ -192,8 +188,12 @@ def get_method_name_from_invocation(self, method_invocation: str) -> str:
192188
Using the tree-sitter query, extract the method name from the method invocation.
193189
"""
194190

195-
captures: Captures = self.frame_query_and_capture_output(
196-
"(method_invocation object: (identifier) @class_name name: (identifier) @method_name)", method_invocation)
191+
captures: Captures = self.frame_query_and_capture_output("(method_invocation name: (identifier) @method_name)", method_invocation)
192+
return captures[0].node.text.decode()
193+
194+
def get_identifier_from_arbitrary_statement(self, statement: str) -> str:
195+
"""Get the identifier from an arbitrary statement."""
196+
captures: Captures = self.frame_query_and_capture_output("(identifier) @identifier", statement)
197197
return captures[0].node.text.decode()
198198

199199
def safe_ascend(self, node: Node, ascend_count: int) -> Node:
@@ -260,7 +260,7 @@ def get_call_targets(self, method_body: str, declared_methods: dict) -> Set[str]
260260
)
261261
return call_targets
262262

263-
def get_calling_lines(self, source_method_code: str, target_method_name: str) -> List[int]:
263+
def get_calling_lines(self, source_method_code: str, target_method_name: str, is_target_method_a_constructor: bool) -> List[int]:
264264
"""
265265
Returns a list of line numbers in source method where target method is called.
266266
@@ -272,26 +272,34 @@ def get_calling_lines(self, source_method_code: str, target_method_name: str) ->
272272
target_method_code : str
273273
target method code
274274
275+
is_target_method_a_constructor : bool
276+
True if target method is a constructor, False otherwise.
277+
275278
Returns:
276279
--------
277280
List[int]
278281
List of line numbers within in source method code block.
279282
"""
280-
query = "(method_invocation name: (identifier) @method_name)"
283+
if not source_method_code:
284+
return []
285+
query = "(object_creation_expression (type_identifier) @object_name) (object_creation_expression type: (scoped_type_identifier (type_identifier) @type_name)) (method_invocation name: (identifier) @method_name)"
286+
281287
# if target_method_name is a method signature, get the method name
282288
# if it is not a signature, we will just keep the passed method name
289+
290+
target_method_name = target_method_name.split("(")[0] # remove the arguments from the constructor name
283291
try:
284-
target_method_name = self.get_method_name_from_declaration(target_method_name)
285-
except Exception:
286-
pass
287-
288-
captures: Captures = self.frame_query_and_capture_output(query, source_method_code)
289-
# Find the line numbers where target method calls happen in source method
290-
target_call_lines = []
291-
for c in captures:
292-
method_name = c.node.text.decode()
293-
if method_name == target_method_name:
294-
target_call_lines.append(c.node.start_point[0])
292+
captures: Captures = self.frame_query_and_capture_output(query, source_method_code)
293+
# Find the line numbers where target method calls happen in source method
294+
target_call_lines = []
295+
for c in captures:
296+
method_name = c.node.text.decode()
297+
if method_name == target_method_name:
298+
target_call_lines.append(c.node.start_point[0])
299+
except:
300+
logger.warning(f"Unable to get calling lines for {target_method_name} in {source_method_code}.")
301+
return []
302+
295303
return target_call_lines
296304

297305
def get_test_methods(self, source_class_code: str) -> Dict[str, str]:
@@ -398,8 +406,7 @@ def get_method_return_type(self, source_code: str) -> str:
398406
The return type of the method.
399407
"""
400408

401-
type_references: Captures = self.frame_query_and_capture_output(
402-
"(method_declaration type: ((type_identifier) @type_id))", source_code)
409+
type_references: Captures = self.frame_query_and_capture_output("(method_declaration type: ((type_identifier) @type_id))", source_code)
403410

404411
return type_references[0].node.text.decode()
405412

@@ -426,9 +433,9 @@ def collect_leaf_token_values(node):
426433
if len(node.children) == 0:
427434
if filter_by_node_type is not None:
428435
if node.type in filter_by_node_type:
429-
lexical_tokens.append(code[node.start_byte: node.end_byte])
436+
lexical_tokens.append(code[node.start_byte : node.end_byte])
430437
else:
431-
lexical_tokens.append(code[node.start_byte: node.end_byte])
438+
lexical_tokens.append(code[node.start_byte : node.end_byte])
432439
else:
433440
for child in node.children:
434441
collect_leaf_token_values(child)
@@ -462,11 +469,9 @@ def remove_all_comments(self, source_code: str) -> str:
462469
pruned_source_code = self.make_pruned_code_prettier(source_code)
463470

464471
# Remove all comment lines: the comment lines start with / (for // and /*) or * (for multiline comments).
465-
comment_blocks: Captures = self.frame_query_and_capture_output(query="((block_comment) @comment_block)",
466-
code_to_process=source_code)
472+
comment_blocks: Captures = self.frame_query_and_capture_output(query="((block_comment) @comment_block)", code_to_process=source_code)
467473

468-
comment_lines: Captures = self.frame_query_and_capture_output(query="((line_comment) @comment_line)",
469-
code_to_process=source_code)
474+
comment_lines: Captures = self.frame_query_and_capture_output(query="((line_comment) @comment_line)", code_to_process=source_code)
470475

471476
for capture in comment_blocks:
472477
pruned_source_code = pruned_source_code.replace(capture.node.text.decode(), "")
@@ -490,8 +495,7 @@ def make_pruned_code_prettier(self, pruned_code: str) -> str:
490495
The prettified pruned code.
491496
"""
492497
# First remove remaining block comments
493-
block_comments: Captures = self.frame_query_and_capture_output(query="((block_comment) @comment_block)",
494-
code_to_process=pruned_code)
498+
block_comments: Captures = self.frame_query_and_capture_output(query="((block_comment) @comment_block)", code_to_process=pruned_code)
495499

496500
for capture in block_comments:
497501
pruned_code = pruned_code.replace(capture.node.text.decode(), "")

cldk/models/java/models.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
"""
1818
Models module
1919
"""
20-
2120
import re
2221
from contextvars import ContextVar
2322
from typing import Dict, List, Optional
@@ -64,7 +63,7 @@ class JCallableParameter(BaseModel):
6463
modifiers (List[str]): The modifiers applied to the parameter.
6564
"""
6665

67-
name: str
66+
name: str | None
6867
type: str
6968
annotations: List[str]
7069
modifiers: List[str]
@@ -361,10 +360,30 @@ class JGraphEdges(BaseModel):
361360
@field_validator("source", "target", mode="before")
362361
@classmethod
363362
def validate_source(cls, value) -> JMethodDetail:
364-
file_path, type_declaration, callable_declaration = value["file_path"], value["type_declaration"], value["callable_declaration"]
365-
j_callable = _CALLABLES_LOOKUP_TABLE.get((file_path, type_declaration, callable_declaration), None)
366-
if j_callable is None:
367-
raise ValueError(f"Callable not found in lookup table: {file_path}, {type_declaration}, {callable_declaration}")
363+
_, type_declaration, signature = value["file_path"], value["type_declaration"], value["signature"]
364+
j_callable = _CALLABLES_LOOKUP_TABLE.get(
365+
(type_declaration, signature),
366+
JCallable(
367+
signature=signature,
368+
is_implicit=True,
369+
is_constructor="<init>" in value["callable_declaration"],
370+
comment="",
371+
annotations=[],
372+
modifiers=[],
373+
thrown_exceptions=[],
374+
declaration="",
375+
parameters=[JCallableParameter(name=None, type=t, annotations=[], modifiers=[]) for t in value["callable_declaration"].split("(")[1].split(")")[0].split(",")],
376+
code="",
377+
start_line=-1,
378+
end_line=-1,
379+
referenced_types=[],
380+
accessed_fields=[],
381+
call_sites=[],
382+
variable_declarations=[],
383+
cyclomatic_complexity=0,
384+
),
385+
)
386+
_CALLABLES_LOOKUP_TABLE[(type_declaration, signature)] = j_callable
368387
class_name = type_declaration
369388
method_decl = j_callable.declaration
370389
return JMethodDetail(method_declaration=method_decl, klass=class_name, method=j_callable)
@@ -391,9 +410,8 @@ class JApplication(BaseModel):
391410
@field_validator("symbol_table", mode="after")
392411
@classmethod
393412
def validate_source(cls, symbol_table):
394-
395413
# Populate the lookup table for callables
396-
for file_path, j_compulation_unit in symbol_table.items():
414+
for _, j_compulation_unit in symbol_table.items():
397415
for type_declaration, jtype in j_compulation_unit.type_declarations.items():
398-
for callable_declaration, j_callable in jtype.callable_declarations.items():
399-
_CALLABLES_LOOKUP_TABLE[(file_path, type_declaration, callable_declaration)] = j_callable
416+
for __, j_callable in jtype.callable_declarations.items():
417+
_CALLABLES_LOOKUP_TABLE[(type_declaration, j_callable.signature)] = j_callable

0 commit comments

Comments
 (0)
0