initial commit

2025-06-05 18:08:36 +05:30
commit 9cc5f634ee
1 changed files with 142 additions and 0 deletions
--- a/ac_specs_extraction.py
+++ b/ac_specs_extraction.py
@@ -0,0 +1,142 @@
 import base64
 import vertexai
 from vertexai.generative_models import GenerativeModel, Part, SafetySetting
 import io
 from PIL import Image
 import json
 import os
 import pandas as pd
 from tqdm import tqdm
 def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is opened in binary mode, read in full, base64-encoded, and the
    encoded bytes are decoded as UTF-8 so that a ``str`` is returned.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    # Encoding happens after the file is closed; only the read needs the handle.
    return base64.b64encode(raw_bytes).decode("utf-8")
 def generate(image_path):
    """Extract energy-label fields from one image using Gemini on Vertex AI.

    The raw image bytes are sent together with a fixed extraction prompt, and
    the model is asked to answer in JSON (``response_mime_type``).

    Args:
        image_path: Path of the image file to read from disk.

    Returns:
        The model's reply text parsed with ``json.loads``.

    Raises:
        OSError: If the image file cannot be opened or read.
        json.JSONDecodeError: If the reply text is not valid JSON.
        Any error raised by the Vertex AI SDK propagates unchanged.
    """
    # Local import so this function does not depend on the file's import block.
    import mimetypes

    vertexai.init(project="lt-dev-prj-3d3b5f2a", location="asia-south1")
    model = GenerativeModel(
        "gemini-1.5-pro-002",
    )
    generation_config = {
        "max_output_tokens": 8192,
        "temperature": 0.3,
        "top_p": 0.95,
        "response_mime_type": "application/json"
    }
    # Every listed harm category is set to BLOCK_NONE, i.e. no safety blocking
    # is requested for this call.
    unblocked_categories = (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
    safety_settings = [
        SafetySetting(
            category=category,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE
        )
        for category in unblocked_categories
    ]

    # Declare the image's real type instead of always claiming JPEG: callers in
    # this file also pass .png and .gif files. Fall back to JPEG (the previous
    # fixed value) when the extension is unknown or not an image type.
    # NOTE(review): confirm the model accepts every type that can reach here
    # (GIF in particular).
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Part.from_data takes raw bytes, so read them directly rather than
    # base64-encoding and immediately decoding again.
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()
    image = Part.from_data(
        mime_type=mime_type,
        data=image_bytes
    )
    prompt = """Extract the following information from the image in JSON format.
            STAR RATING --> INTEGER
            ISEER --> FLOAT
            EER (W/W) --> FLOAT
            Label Period Start Date --> DATE
            Label Period End Date --> DATE
            ApplianceType --> STRING
            Brand --> STRING
            Model --> STRING
            Year --> DATE (YYYY)
            Cooling Capacity (100%) (W) --> INTEGER
            Cooling Capacity (50%)(W) --> INTEGER
            Cooling Capacity (W) --> INTEGER
            Electricity Consumption (UNITS) --> FLOAT
            Power Consumption (W) --> FLOAT
            Compressor Type --> STRING
            Variable Speed Compressor --> BOOLEAN(YES/NO)
            Heat Pump --> BOOLEAN(YES/NO)
            MODEL NO. --> STRING
            MODEL NAME. --> STRING
            INPUT POWER --> INTEGER
            REFRIGERANT --> STRING
            ISO --> STRING
            OPERATING HOURS PER ANNUM --> INTEGER
            Note: STAR RATING is usually highlighted in Red. SO all the starts with background in RED are considered as STAR RATING. EG.
            If only one star has background RED then the STAR RATING WILL BE 1. If all 5 starts have Background RED then the STAR RATING
            will be 5.
            Provide blank value "" for fields that are not applicable/found in the image.
            The value for boolean fields should be "YES" or "NO".
            There should strictly no other keys apart from the mentioned above. Even the naming conventions should not change.
            """
    responses = model.generate_content(
        [image, prompt],
        generation_config=generation_config,
        safety_settings=safety_settings,
    )
    # The reply is requested as application/json, so its text is parsed as-is.
    return json.loads(responses.text)
 def process_files(directory_path):
    """Run ``generate`` on every image file directly inside ``directory_path``.

    A file counts as an image when its lower-cased name ends with .jpg, .jpeg,
    .png or .gif; sub-directories are not entered.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        A ``pandas.DataFrame`` with one row per image file. A successful row is
        the dict returned by ``generate`` plus a "File Name" entry; a failed
        row holds "File Name" and "Error" (the exception text). The frame is
        empty when no image files are found.
    """
    print(f"Processing files in directory: {directory_path}")

    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif')
    image_files = []
    for entry in os.listdir(directory_path):
        if not entry.lower().endswith(image_suffixes):
            continue
        if os.path.isfile(os.path.join(directory_path, entry)):
            image_files.append(entry)

    rows = []
    if len(image_files) == 0:
        print("No image files found in the directory.")
        return pd.DataFrame(rows)

    for filename in tqdm(image_files, desc="Processing images"):
        filepath = os.path.join(directory_path, filename)
        try:
            # Tagging and appending stay inside the try so that a reply which
            # cannot take a "File Name" key is recorded as an error row too.
            record = generate(filepath)
            record["File Name"] = str(filename)
            rows.append(record)
        except Exception as e:
            # Best effort: report the failure, keep a row for it, and move on.
            print(f"Error processing file {filename}: {e}")
            rows.append({"File Name": str(filename), "Error": str(e)})

    return pd.DataFrame(rows)
 if __name__ == "__main__":
    # Script entry point: extract data from every image in one folder, show a
    # preview, and write the full result to a CSV file.
    directory_path = 'FOLDER4' # Make sure this folder exists and contains images
    if os.path.isdir(directory_path):
        extracted_data = process_files(directory_path)
        if extracted_data.empty:
            print("\nNo data was extracted.")
        else:
            print("\n--- Extracted Data (First 5 rows) ---")
            print(extracted_data.head())
            output_csv_path = 'extracted_image_data.csv'
            try:
                extracted_data.to_csv(output_csv_path, index=False)
                print(f"\nSuccessfully saved extracted data to {output_csv_path}")
            except Exception as e:
                # A failed save is reported rather than raised.
                print(f"\nError saving data to CSV: {e}")
    else:
        print(f"Error: Directory '{directory_path}' not found. Please create it or provide the correct path.")