Default functions and format
June 25, 2025 · View on GitHub
We provide default functions and format in our benchmark, including prompt building, preprocessing, postprocessing and parsing functions. We elaborate these functions and their outputs format here, and thus if it satisfy your requirement, you only need to set a system prompt, user prompt template and parsing functions. Our parsing functions are based on regex and it is robust I think, so maybe you even do not need to implement parsing function.
Prompt construction
L1-GUI Content Understanding
function:
L1_SYSTEM_PROMPT_DEFAULT="""You are a GUI agent. You are given a screenshot of an application, a question and corresponding options. You need to choose one option as your answer for the question. Finally, you are ONLY allowed to return the single letter of your choice."""
def build_prompt(self, line, use_system=True, custom_system_prompt=None):
tgt_path = self.dump_image(line)
question = line["question"]
options = {cand: line["options"][cand] for cand in string.ascii_uppercase}
options_prompt = "Options:\n"
for key, item in options.items():
options_prompt += f"{key}. {item}\n"
user_prompt = ""
user_prompt += f"Question: {question}\n"
if len(options):
user_prompt += options_prompt
user_prompt += "Please select the correct answer from the options above. \n"
msgs = []
if use_system:
system_prompt = (
L1_SYSTEM_PROMPT_DEFAULT
if (custom_system_prompt is None) or (custom_system_prompt == "")
else custom_system_prompt
)
msgs.append(dict(role="system", type="text", value=system_prompt))
msgs = [dict(role="user", type="image", value=tgt_path)]
msgs.append(dict(role="user", type="text", value=user_prompt))
return msgs
output format (we enable system prompt for example):
[
{
"role": "system",
"type": "text",
"value": "You are a GUI agent. You are given a screenshot of an application, a question and corresponding options. You need to choose one option as your answer for the question. Finally, you are ONLY allowed to return the single letter of your choice."
},
{
"role": "user",
"type": "image",
"value": "/path/of/your/dataroot/os_ios/9e304d4e_5fdc3924_51c74094e7e217f384edd0d882ea6fb19b839ddc029893daa6dd17fafb49b3d6.png"
},
{
"role": "user",
"type": "text",
"value": """Question: Based on the navigation elements, what can be inferred about the current screen's position in the app's hierarchy?
Options:
A. It's a sub-screen within a 'Rings' section
B. It's the main dashboard of the app
C. It's a sub-screen within the 'Summary' section
D. It's a standalone 'Awards' page accessible from anywhere
E. It's the 'Sharing' section of the app
Please select the correct answer from the options above."""
},
]
L2-GUI Element Grounding
function:
L2_SYSTEM_PROMPT_DEFAULT = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to finish this task following instructions from users."""
L2_USER_PROMPT_DEFAULT = """Output only the coordinate (x,y) of one point in your response. What element matches the following task: {instruction}"""
def build_prompt(self, line, use_system=True, custom_system_prompt=None):
tgt_path = self.dump_image(line)
user_prompt_template = os.environ.get("L2_USER_PROMPT", L2_USER_PROMPT_DEFAULT)
user_prompt = user_prompt_template.format(instruction=line["instruction"])
msgs = []
if use_system:
system_prompt = (
L2_SYSTEM_PROMPT_DEFAULT
if (custom_system_prompt is None) or (custom_system_prompt == "")
else custom_system_prompt
)
msgs.append(dict(role="system", type="text", value=system_prompt))
msgs = [dict(role="user", type="image", value=tgt_path)]
msgs.append(dict(role="user", type="text", value=user_prompt))
return msgs
output format (we enable system prompt for example):
[
{
"role": "system",
"type": "text",
"value": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to finish this task following instructions from users."
},
{
"role": "user",
"type": "image",
"value": "/path/of/your/dataroot/os_windows/0b08bd98_a0e7b2a5_68e346390d562be39f55c1aa7db4a5068d16842c0cb29bd1c6e3b49292a242d1.png"
},
{
"role": "user",
"type": "text",
"value": "Output only the coordinate (x,y) of one point in your response. What element matches the following task: The downward arrow button allows you to scroll down through the list of years."
},
]
Preprocess
Local deployment model
function:
def default_preprocess_function(self, message, model, processor, **kwargs):
# We apply `process_vision_info` function of qwen_vl_utils as our default process function for visual part.
from qwen_vl_utils import process_vision_info
messages = []
if "system" == message[0]["role"]:
messages.append({"role": "system", "content": message[0]["value"]})
message = message[1:]
messages.append(
{
"role": "user",
"content": simple_prepare_content(message, processor, **kwargs),
}
)
text = processor.apply_chat_template(
[messages], tokenize=False, add_generation_prompt=True
)
images, videos = process_vision_info([messages])
inputs = processor(
text=text, images=images, videos=videos, padding=True, return_tensors="pt"
)
inputs = inputs.to("cuda")
return inputs
def simple_prepare_content(inputs, processor, **kwargs):
content = []
for s in inputs:
if s["type"] == "image":
item = {"type": "image", "image": ensure_image_url(s["value"])}
elif s["type"] == "text":
item = {"type": "text", "text": s["value"]}
elif s["type"] == "audio":
item = {"type": "audio", "audio": s["value"]}
else:
raise ValueError(f"Invalid message type: {s['type']}, {s}")
content.append(item)
return content
API-based model
output format (we enable system prompt for example):
[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to finish this task following instructions from users."
}
],
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": "data:image/png;base64,xxxxxxxxxxxxxxxxxxxxxxxxx......xxxxxxxxxxxxxxxxxx"
},
{
"type": "text",
"text": "Output only the coordinate (x,y) of one point in your response. What element matches the following task: The downward arrow button allows you to scroll down through the list of years."
}
]
}
]
Postprocess
Local deployment model
We decode the predicted ids into readable text strings and then directly return them.
def default_postprocess_function(self, message, model, processor, **kwargs):
outputs = outputs.cpu().numpy()
out = processor.batch_decode(outputs, skip_special_tokens=True)
response = out[0]
return response
API-based model
We directly return the response from api.
def default_postprocess_function(self, message, model, processor, **kwargs):
# Due to various response format when calling api model, we directly return the response as our default implementation.
# Thus, the desired output used for calculating matrics should be processed by `parse_function_xxxxxx` functions implemented in your model file.
return message
Parsing functions
L1-GUI Content Understanding
def parser_response_into_coordinates(text, meta=None):
"""
Default parse function for the response.
It should be overridden by the user if needed.
"""
pattern = r"""
(?:x\s*[:=]?\s*)?
[\(\[\{]?
\s*([-+]?(?:\d+\.\d+|\.\d+|\d+))\s*
[,\s;]+
(?:y\s*[:=]?\s*)?
\s*([-+]?(?:\d+\.\d+|\.\d+|\d+))\s*
[\)\]\}]?
"""
matches = re.findall(pattern, text, re.IGNORECASE | re.VERBOSE)
if len(matches) == 0:
return None
else:
return [(float(x), float(y)) for x, y in matches][
0
] # we select the first if multiple (x,y) are parsed.
L2-GUI Element Grounding
def parser_answers_into_option(text, meta=None):
patterns = [
r"\b([A-F])[\.:](?!\w)", # parser: A. B:
r"\bOption\s+([A-F])\b", # Option A / Option B
r"\bAnswer\s*[::]?\s*([A-F])\b", # Answer: A
r"^[ \t]*([A-F])[\.:]?", # the first letter in a row: A. or A:
r"[\'\"]([A-F])[\'\"]", # 'A' or "B"
r"\b([A-F])\b(?!\s+\w)", # a single alphabet A-F, without following word
]
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
if match:
return match.group(1).upper()
return None