# UtilityPole_Info/Pole_serial_sorter.py

import os
import shutil
import pytesseract
from PIL import Image
import re
from pytesseract import Output
import cv2
import numpy as np

# --- User settings ---
# Root folder that contains the raw images and receives the sorted output folders.
TOP_SOURCE_DIR = '/Volumes/ExSSD/Working/용공추 사진/'
# Folder that is walked recursively for input images.
SOURCE_DIR = os.path.join(TOP_SOURCE_DIR, 'Raw_Data_4')

# Output folders: images with / without a detected serial number.
SERIAL_FOLDER = os.path.join(TOP_SOURCE_DIR, '일련번호_사진')
NON_SERIAL_FOLDER = os.path.join(TOP_SOURCE_DIR, '일반_사진')

# Serial-number regular expression: a standalone run of 5-6 digits.
# It must not be a whitespace-only pattern such as r' ': that matches any
# space in the OCR text, so nearly every image would be filed as a serial photo.
SERIAL_PATTERN = r'\b\d{5,6}\b'

def find_number_below_security_light_tesseract(data):
    """Return the digit-only OCR token nearest below a "보안등" label.

    Args:
        data: Mapping with parallel 'text', 'top' and 'height' sequences, as
            produced in this file by
            ``pytesseract.image_to_data(..., output_type=Output.DICT)``.

    Returns:
        The text of the digit-only token with the smallest ``top`` among the
        tokens whose ``top`` lies more than 5 units below the bottom edge
        (``top + height``) of a token containing "보안등". Labels are tried in
        token order and the first one that has such a number wins. Returns
        None when no label has a number below it or no label is present.
    """
    texts = data['text']
    tops = data['top']
    heights = data['height']

    # Debug aid: dump every recognized token so OCR misreads are visible.
    print(texts)

    for i, text in enumerate(texts):
        if '보안등' not in text:
            continue

        # Bottom edge of the label box; candidates must sit below it.
        base_y = tops[i] + heights[i]

        candidates = [
            (top, token)
            for top, token in zip(tops, texts)
            # The +5 margin skips tokens that sit on the label's own line.
            if re.fullmatch(r'\d+', token) and top > base_y + 5
        ]

        # Keep scanning when this label has nothing below it: the same word
        # may be recognized more than once, and giving up on the first
        # occurrence would miss a number under a later one.
        if candidates:
            # min() returns the first minimal element, matching a stable sort.
            return min(candidates, key=lambda c: c[0])[1]

    return None


def extract_serial_number(text):
    """Return the first SERIAL_PATTERN match found in *text*, or None.

    The value is whatever ``re.findall`` yields for its first hit.
    """
    found = re.findall(SERIAL_PATTERN, text)
    if not found:
        return None
    return found[0]


def classify_and_extract():
    """Sort every image under SOURCE_DIR by whether OCR finds a serial number.

    Walks SOURCE_DIR recursively, runs Tesseract (English + Korean) on each
    .jpg/.jpeg/.png file and copies it to SERIAL_FOLDER when
    ``extract_serial_number`` returns a truthy value, otherwise to
    NON_SERIAL_FOLDER. The source files are left in place. One status line
    is printed per image.

    Failures on a single image (unreadable file, OCR error, copy error) are
    reported and skipped so that one bad file does not abort the whole run.

    NOTE: destination names are the bare file names, so two source files with
    the same name in different sub-folders end up at the same destination
    path and the later copy replaces the earlier one.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(SERIAL_FOLDER, exist_ok=True)
    os.makedirs(NON_SERIAL_FOLDER, exist_ok=True)

    for root, _, files in os.walk(SOURCE_DIR):
        for file in files:
            if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue

            img_path = os.path.join(root, file)

            try:
                # Context manager releases the file handle for every image,
                # which matters when walking a large photo tree.
                with Image.open(img_path) as img:
                    # Mixed Korean + English OCR.
                    text = pytesseract.image_to_string(img, lang='eng+kor')
                serial = extract_serial_number(text)

                if serial:
                    shutil.copy2(img_path, os.path.join(SERIAL_FOLDER, file))
                    print(f"[✓] {file} → 일련번호: {serial}")
                else:
                    shutil.copy2(img_path, os.path.join(NON_SERIAL_FOLDER, file))
                    print(f"[ ] {file} → 일반 사진")
            except Exception as e:
                # Deliberate best-effort boundary: report and move on.
                print(f"[!] 오류: {file} → {e}")


if __name__ == '__main__':
    # Batch sorting is currently disabled; uncomment to process SOURCE_DIR.
    # classify_and_extract()

    # Single-image OCR experiment. Other sample images are kept for quick switching.
    # img_path = "/Volumes/ExSSD/Working/용공추 사진/2,3월 데이터/Pole/20250218_114838.jpg"
    img_path = '/Volumes/ExSSD/Working/용공추 사진/2,3월 데이터/Pole/20250307_153821.jpg'
    # img_path = "/Volumes/ExSSD/Working/용공추 사진/2,3월 데이터/Pole/20250303_121704.jpg"

    # Disabled preprocessing experiment (grayscale -> Otsu threshold -> closing):
    # image = cv2.imread(img_path, cv2.IMREAD_COLOR)
    # gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # kernel = np.ones((1, 1), np.uint8)
    # denoised = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    # Open the image in a context manager so its file handle is released
    # once both OCR passes are done.
    with Image.open(img_path) as image:
        # Pass 1: per-token boxes, used to locate the number under "보안등".
        data = pytesseract.image_to_data(image, output_type=Output.DICT, lang='eng+kor')

        number = find_number_below_security_light_tesseract(data)
        print("보안등 아래 숫자:", number)

        # Pass 2: plain text dump with OCR engine mode 3 and page
        # segmentation mode 6 (treat the image as one uniform text block).
        # NOTE(review): the language order here ('kor+eng') differs from
        # pass 1 ('eng+kor') - confirm whether that is intentional.
        custom_config = r'--oem 3 --psm 6'
        text = pytesseract.image_to_string(image, lang='kor+eng', config=custom_config)

        print(text)