Files
Algo_Repo/ac_specs_extraction.py
Aditya Thaker 9cc5f634ee initial commit
2025-06-05 18:08:36 +05:30

142 lines
5.1 KiB
Python

import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting
import io
from PIL import Image
import json
import os
import pandas as pd
from tqdm import tqdm
def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is opened in binary mode, read in full, base64-encoded and the
    resulting bytes are decoded as UTF-8 so the caller receives a ``str``.
    """
    with open(image_path, "rb") as source:
        raw = source.read()
    return str(base64.b64encode(raw), "utf-8")
# Instruction text sent to the model together with the image. It is part of
# the program's behaviour: the keys listed here are the keys expected in the
# returned JSON, so edit it with care.
_EXTRACTION_PROMPT = """Extract the following information from the image in JSON format.
STAR RATING --> INTEGER
ISEER --> FLOAT
EER (W/W) --> FLOAT
Label Period Start Date --> DATE
Label Period End Date --> DATE
ApplianceType --> STRING
Brand --> STRING
Model --> STRING
Year --> DATE (YYYY)
Cooling Capacity (100%) (W) --> INTEGER
Cooling Capacity (50%)(W) --> INTEGER
Cooling Capacity (W) --> INTEGER
Electricity Consumption (UNITS) --> FLOAT
Power Consumption (W) --> FLOAT
Compressor Type --> STRING
Variable Speed Compressor --> BOOLEAN(YES/NO)
Heat Pump --> BOOLEAN(YES/NO)
MODEL NO. --> STRING
MODEL NAME. --> STRING
INPUT POWER --> INTEGER
REFRIGERANT --> STRING
ISO --> STRING
OPERATING HOURS PER ANNUM --> INTEGER
Note: STAR RATING is usually highlighted in Red. SO all the starts with background in RED are considered as STAR RATING. EG.
If only one star has background RED then the STAR RATING WILL BE 1. If all 5 starts have Background RED then the STAR RATING
will be 5.
Provide blank value "" for fields that are not applicable/found in the image.
The value for boolean fields should be "YES" or "NO".
There should strictly no other keys apart from the mentioned above. Even the naming conventions should not change.
"""


def generate(image_path):
    """Extract energy-label fields from an image with Gemini on Vertex AI.

    The image bytes and a fixed extraction prompt are sent to the
    ``gemini-1.5-pro-002`` model with a JSON response MIME type, and the
    response text is parsed as JSON.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The value produced by ``json.loads`` on the model's response text.

    Raises:
        OSError: If the image file cannot be opened or read.
        json.JSONDecodeError: If the response text is not valid JSON.
        Errors raised by the Vertex AI SDK propagate unchanged.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    import mimetypes

    vertexai.init(project="lt-dev-prj-3d3b5f2a", location="asia-south1")
    model = GenerativeModel(
        "gemini-1.5-pro-002",
    )
    generation_config = {
        "max_output_tokens": 8192,
        "temperature": 0.3,
        "top_p": 0.95,
        "response_mime_type": "application/json",
    }

    # Every harm category is set to BLOCK_NONE, in the same order as before.
    harm_categories = (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
    safety_settings = [
        SafetySetting(
            category=category,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE,
        )
        for category in harm_categories
    ]

    # Read the raw bytes directly; encoding to base64 only to decode again
    # produced the same bytes at extra cost.
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    # Label the payload with the type implied by the file name instead of
    # always claiming JPEG; fall back to JPEG when the name gives no image
    # type.
    # NOTE(review): confirm the model accepts every image type callers pass
    # (e.g. image/gif).
    guessed_type, _ = mimetypes.guess_type(str(image_path))
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    image = Part.from_data(
        mime_type=mime_type,
        data=image_bytes,
    )
    responses = model.generate_content(
        [image, _EXTRACTION_PROMPT],
        generation_config=generation_config,
        safety_settings=safety_settings,
    )
    return json.loads(responses.text)
def process_files(directory_path):
    """Run ``generate`` on every image in a directory and collect the results.

    Regular files whose names end in ``.jpg``, ``.jpeg``, ``.png`` or ``.gif``
    (case-insensitive) are processed in sorted file-name order, so the row
    order of the result does not depend on the arbitrary order returned by
    ``os.listdir``.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A ``pandas.DataFrame`` with one row per image. A successful row holds
        the extracted fields plus ``"File Name"``; a failed row holds
        ``"File Name"`` and ``"Error"``. The frame is empty when the
        directory contains no matching files.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
    print(f"Processing files in directory: {directory_path}")
    image_files = sorted(
        name
        for name in os.listdir(directory_path)
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(directory_path, name))
    )
    if not image_files:
        print("No image files found in the directory.")
        return pd.DataFrame([])

    rows = []
    for filename in tqdm(image_files, desc="Processing images"):
        filepath = os.path.join(directory_path, filename)
        try:
            row = generate(filepath)
            row["File Name"] = str(filename)
        except Exception as e:
            # Deliberate best-effort: one bad image must not abort the batch,
            # so the failure is reported and recorded as its own row.
            print(f"Error processing file {filename}: {e}")
            row = {"File Name": str(filename), "Error": str(e)}
        rows.append(row)
    return pd.DataFrame(rows)
# Script entry point: extract data from every image in a fixed folder, show a
# preview and write the full result to a CSV file.
if __name__ == "__main__":
    directory_path = 'FOLDER4'  # Make sure this folder exists and contains images
    if os.path.isdir(directory_path):
        extracted_data = process_files(directory_path)
        if extracted_data.empty:
            print("\nNo data was extracted.")
        else:
            print("\n--- Extracted Data (First 5 rows) ---")
            print(extracted_data.head())
            output_csv_path = 'extracted_image_data.csv'
            try:
                extracted_data.to_csv(output_csv_path, index=False)
                print(f"\nSuccessfully saved extracted data to {output_csv_path}")
            except Exception as e:
                # A failed save is reported rather than raised.
                print(f"\nError saving data to CSV: {e}")
    else:
        print(f"Error: Directory '{directory_path}' not found. Please create it or provide the correct path.")