initial commit
This commit is contained in:
142
ac_specs_extraction.py
Normal file
142
ac_specs_extraction.py
Normal file
@@ -0,0 +1,142 @@
|
||||
import base64
|
||||
import vertexai
|
||||
from vertexai.generative_models import GenerativeModel, Part, SafetySetting
|
||||
import io
|
||||
from PIL import Image
|
||||
import json
|
||||
import os
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as Base64 text.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The Base64 encoding of the file's bytes, decoded to a UTF-8 ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    # b64encode yields bytes; callers expect text, so decode before returning.
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
|
||||
|
||||
def generate(image_path):
    """Send one label image to Gemini on Vertex AI and return the parsed JSON reply.

    The file at ``image_path`` is read from disk, attached to a fixed
    extraction prompt, and sent to the ``gemini-1.5-pro-002`` model with a
    JSON response MIME type requested.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The object produced by ``json.loads`` on the model's response text.

    Raises:
        OSError: If the image file cannot be opened or read.
        json.JSONDecodeError: If the response text is not valid JSON.
        Any error raised by the Vertex AI SDK propagates unchanged.
    """
    # Function-scope import so this block does not depend on the file header.
    import mimetypes

    vertexai.init(project="lt-dev-prj-3d3b5f2a", location="asia-south1")
    model = GenerativeModel(
        "gemini-1.5-pro-002",
    )

    generation_config = {
        "max_output_tokens": 8192,
        "temperature": 0.3,
        "top_p": 0.95,
        "response_mime_type": "application/json",
    }

    # Every listed harm category gets the BLOCK_NONE threshold.
    harm_categories = (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
    safety_settings = [
        SafetySetting(
            category=category,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE,
        )
        for category in harm_categories
    ]

    # Callers also pass .png and .gif files, so label the payload with the type
    # implied by the file name; fall back to the previous fixed "image/jpeg"
    # when the extension is unknown or not an image type.
    guessed_type, _ = mimetypes.guess_type(image_path)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    # Part.from_data takes raw bytes, so read them directly instead of
    # Base64-encoding and immediately decoding the same data.
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    image = Part.from_data(
        mime_type=mime_type,
        data=image_bytes,
    )

    # The prompt text is sent verbatim; its lines are kept flush-left so no
    # indentation leaks into the string.
    prompt = """Extract the following information from the image in JSON format.
STAR RATING --> INTEGER
ISEER --> FLOAT
EER (W/W) --> FLOAT
Label Period Start Date --> DATE
Label Period End Date --> DATE
ApplianceType --> STRING
Brand --> STRING
Model --> STRING
Year --> DATE (YYYY)
Cooling Capacity (100%) (W) --> INTEGER
Cooling Capacity (50%)(W) --> INTEGER
Cooling Capacity (W) --> INTEGER
Electricity Consumption (UNITS) --> FLOAT
Power Consumption (W) --> FLOAT
Compressor Type --> STRING
Variable Speed Compressor --> BOOLEAN(YES/NO)
Heat Pump --> BOOLEAN(YES/NO)
MODEL NO. --> STRING
MODEL NAME. --> STRING
INPUT POWER --> INTEGER
REFRIGERANT --> STRING
ISO --> STRING
OPERATING HOURS PER ANNUM --> INTEGER

Note: STAR RATING is usually highlighted in Red. SO all the starts with background in RED are considered as STAR RATING. EG.
If only one star has background RED then the STAR RATING WILL BE 1. If all 5 starts have Background RED then the STAR RATING
will be 5.

Provide blank value "" for fields that are not applicable/found in the image.
The value for boolean fields should be "YES" or "NO".

There should strictly no other keys apart from the mentioned above. Even the naming conventions should not change.
"""

    responses = model.generate_content(
        [image, prompt],
        generation_config=generation_config,
        safety_settings=safety_settings,
    )

    return json.loads(responses.text)
|
||||
|
||||
def process_files(directory_path):
    """Run ``generate`` on every image in a directory and tabulate the results.

    Regular files directly inside ``directory_path`` whose names end in
    ``.jpg``, ``.jpeg``, ``.png`` or ``.gif`` (case-insensitive) are processed
    one at a time with a progress bar.

    Args:
        directory_path: Directory to scan for image files.

    Returns:
        A ``pandas.DataFrame`` with one row per image. A successful row holds
        the value returned by ``generate`` plus a ``"File Name"`` entry; a
        failed row holds ``"File Name"`` and ``"Error"`` (the exception text).
        The frame is empty when the directory contains no matching files.
    """
    print(f"Processing files in directory: {directory_path}")

    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif')
    image_names = []
    for entry in os.listdir(directory_path):
        if not entry.lower().endswith(image_suffixes):
            continue
        if os.path.isfile(os.path.join(directory_path, entry)):
            image_names.append(entry)

    rows = []
    if not image_names:
        print("No image files found in the directory.")
        return pd.DataFrame(rows)

    for filename in tqdm(image_names, desc="Processing images"):
        try:
            record = generate(os.path.join(directory_path, filename))
            record["File Name"] = str(filename)
        except Exception as e:
            # Best-effort batch: report the failure, record it as a row, and
            # keep going with the remaining files.
            print(f"Error processing file {filename}: {e}")
            record = {"File Name": str(filename), "Error": str(e)}
        rows.append(record)

    return pd.DataFrame(rows)
|
||||
|
||||
if __name__ == "__main__":
    # Folder scanned for images; it must already exist and contain images.
    directory_path = 'FOLDER4'

    if os.path.isdir(directory_path):
        extracted_data = process_files(directory_path)

        if extracted_data.empty:
            print("\nNo data was extracted.")
        else:
            print("\n--- Extracted Data (First 5 rows) ---")
            print(extracted_data.head())

            # Persist the full table next to the script's working directory.
            output_csv_path = 'extracted_image_data.csv'
            try:
                extracted_data.to_csv(output_csv_path, index=False)
                print(f"\nSuccessfully saved extracted data to {output_csv_path}")
            except Exception as e:
                # A failed save is reported rather than raised so the preview
                # printed above is not followed by a traceback.
                print(f"\nError saving data to CSV: {e}")
    else:
        print(f"Error: Directory '{directory_path}' not found. Please create it or provide the correct path.")
|
||||
Reference in New Issue
Block a user