자료수집: 상품 리스트 정보 추출법 (노가다 방식)

핵심 원리

공통 구조 파악: 대부분의 쇼핑몰은 상품 리스트를 반복되는 DOM 블록으로 렌더링합니다.
상품 단위로 반복: 상품 하나를 감싸는 공통 class나 태그를 기준으로 상품 리스트를 추출합니다.
개별 정보 추출: 썸네일, 가격, 링크 등은 각 상품 블록 내부의 태그에서 CSS selector로 추출합니다.

브라우저 DevTools의 Console을 이용해서 직접 추출

javascript

// Run in the DevTools console: turn every product card on the page into a
// plain object. A field is undefined when its element is not found.
[...document.querySelectorAll('.product-card')].map((card) => {
    const titleNode = card.querySelector('.product-title');
    const priceNode = card.querySelector('.product-price');
    const anchorNode = card.querySelector('a');
    const imageNode = card.querySelector('img');

    return {
        title: titleNode?.innerText,
        price: priceNode?.innerText,
        link: anchorNode?.href,
        thumbnail: imageNode?.src,
    };
})

BeautifulSoup 도구 이용 코드 예시 (정적 페이지 기준)

bash

# Install the HTTP client (requests), the HTML parsing library (beautifulsoup4)
# and the lxml parser that the example below passes to BeautifulSoup.
pip install requests beautifulsoup4 lxml

python

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Example: a shop's best-seller product list page (placeholder URL).
url = "https://www.example.com/category/top-items"

# Send a browser-like User-Agent instead of the requests default.
headers = {
    "User-Agent": "Mozilla/5.0"
}

# Bound the wait so a stalled server cannot hang the script, and fail fast on
# an HTTP error instead of parsing an error page as if it were the listing.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")

# Selector for one product block (differs per shop; inspect the page to confirm).
products = soup.select(".product-card")  # example class name

for product in products:
    try:
        title = product.select_one(".product-title").text.strip()
        price = product.select_one(".product-price").text.strip()
        # urljoin resolves "/path", "//host/path" and plain relative URLs
        # against the page URL and leaves already-absolute URLs untouched.
        link = urljoin(url, product.select_one("a")["href"])
        thumbnail = urljoin(url, product.select_one("img")["src"])
    except (AttributeError, KeyError, TypeError) as e:
        # select_one() returns None when an element is missing, which surfaces
        # as AttributeError (.text) or TypeError (subscript); a missing
        # attribute on a found element is KeyError. Skip just this product.
        print("Error parsing product:", e)
        continue

    print({
        "title": title,
        "price": price,
        "link": link,
        "thumbnail": thumbnail
    })

Selenium 도구 이용 코드 예시 (로그인이 필요하거나 JS로 렌더링되는 동적 페이지)

bash

# Install Selenium for driving the browser. The example below also imports bs4
# and uses the "lxml" parser, so beautifulsoup4 and lxml must be installed too.
pip install selenium

python

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Run Chrome without a visible window.
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

# try/finally guarantees the browser is shut down even when navigation or
# parsing raises; otherwise the headless Chrome process would be left running.
try:
    driver.get("https://www.example.com/login")
    # Log in here if the site requires it
    # driver.find_element(...).send_keys(...)
    # driver.find_element(...).click()

    # Move to the listing page after logging in
    driver.get("https://www.example.com/category/top-items")
    time.sleep(3)  # fixed pause to give the page's JS time to load

    # Hand the rendered HTML to BeautifulSoup and reuse the same selectors.
    soup = BeautifulSoup(driver.page_source, "lxml")

    products = soup.select(".product-card")

    for product in products:
        # select_one() returns None when the element is absent; fall back to
        # an empty title instead of aborting the whole loop.
        title_el = product.select_one(".product-title")
        title = title_el.text.strip() if title_el is not None else ""
        # The remaining fields are extracted the same way as in the
        # requests/BeautifulSoup example.
finally:
    driver.quit()

다중 사이트 대응을 위한 설계 아이디어

python

# Per-shop scraping settings, keyed by a short shop name. Each entry holds the
# site's base URL plus the CSS selectors for one product block and for the
# title, price, thumbnail and link elements inside it.
shop_config = {
    "coupang": {
        "base_url": "https://www.coupang.com",
        "product_selector": ".baby-product",
        "title_selector": ".name",
        "price_selector": ".price-value",
        "thumbnail_selector": "img",
        "link_selector": "a"
    },
    "gmarket": {
        "base_url": "https://www.gmarket.co.kr",
        # TODO: add the product/title/price/thumbnail/link selectors for this
        # shop; the entry is incomplete until they are filled in.
    }
}

python

def parse_product(product, config: dict) -> dict:
    """Extract title, price, link and thumbnail from one product element.

    Args:
        product: Parsed element for a single product. Only its
            ``select_one(css)`` method is called; a match must expose
            ``.text`` and ``.attrs``.
        config: Shop settings providing ``base_url``, ``title_selector``,
            ``price_selector``, ``link_selector`` and ``thumbnail_selector``.

    Returns:
        A dict with ``title``, ``price``, ``link`` and ``thumbnail`` strings.
        ``link`` and ``thumbnail`` are resolved to absolute URLs against
        ``base_url``. Any field whose element or attribute is missing is ``""``.

    Raises:
        KeyError: If ``config`` lacks one of the keys listed above.
    """
    # Imported locally so this snippet stays self-contained.
    from urllib.parse import urljoin

    def safe_text(css):
        """Return the stripped text of the first match, or "" if none."""
        el = product.select_one(css)
        return el.text.strip() if el is not None else ""

    def safe_url(base, css, attr):
        """Return the matched element's attribute as an absolute URL, or ""."""
        el = product.select_one(css)
        if el is None:
            return ""
        raw = el.attrs.get(attr)
        if not raw:
            # Missing or empty attribute: report "no URL" rather than letting
            # the result collapse to the shop's home page.
            return ""
        # urljoin handles "/path", "//host/path" and plain relative values,
        # and leaves an already-absolute URL untouched.
        return urljoin(base, raw)

    return {
        "title": safe_text(config["title_selector"]),
        "price": safe_text(config["price_selector"]),
        "link": safe_url(config["base_url"], config["link_selector"], "href"),
        "thumbnail": safe_url(
            config["base_url"], config["thumbnail_selector"], "src"
        ),
    }