python利用xpath爬取网上数据并存储到django模型中

帮朋友制作一个网站,需要一些产品数据信息,因为是代理其他公司产品,直接爬取代理公司产品数据

1.设计数据库

from django.db import models
from uuslug import slugify
import uuid
import os


def products_directory_path(instance, filename):
  """Build the upload path for a product's main image.

  The client-supplied file name is replaced by the first 8 hex characters
  of a random UUID; only the text after the last dot of ``filename`` is kept
  as the extension.

  Args:
    instance: Object being saved; its ``title`` is used as the folder name.
    filename: Name of the uploaded file.

  Returns:
    The path ``images/products/<title>/<random>.<ext>`` joined with the
    platform's path separator.
  """
  # rpartition keeps the whole name when there is no dot, same as split()[-1].
  ext = filename.rpartition('.')[2]
  random_name = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
  return os.path.join('images', 'products', instance.title, random_name)


def product_relatedimage_directory_path(instance, filename):
  """Build the upload path for an image related to a product.

  The client-supplied file name is replaced by the first 8 hex characters
  of a random UUID; only the text after the last dot of ``filename`` is kept
  as the extension.

  Args:
    instance: Object being saved; ``instance.product.title`` is used as the
      folder name.
    filename: Name of the uploaded file.

  Returns:
    The path ``images/product_relatedimage/<product title>/<random>.<ext>``
    joined with the platform's path separator.
  """
  # rpartition keeps the whole name when there is no dot, same as split()[-1].
  ext = filename.rpartition('.')[2]
  random_name = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
  return os.path.join('images', 'product_relatedimage', instance.product.title, random_name)


class ProductsCategory(models.Model):
  """Product category, optionally attached to a parent category."""
  name = models.CharField('产品分类名', max_length=80, unique=True)
  description = models.TextField('产品分类描述', blank=True, null=True)
  slug = models.SlugField('slug', max_length=80, blank=True, null=True)
  # Self-referencing link; deleting a parent cascades to its children.
  parent_category = models.ForeignKey('self', verbose_name="父级分类", blank=True, null=True, on_delete=models.CASCADE)

  def save(self, *args, **kwargs):
    """Derive ``slug`` from ``name`` for unsaved or slug-less rows, then save."""
    if not self.id or not self.slug:
      self.slug = slugify(self.name)
    super().save(*args, **kwargs)

  def __str__(self):
    """Return the category name."""
    return self.name

  class Meta:
    ordering = ['name']
    verbose_name = "产品分类"
    verbose_name_plural = verbose_name


class ProductsTag(models.Model):
  """Product tag identified by a unique name."""
  name = models.CharField('产品标签名', max_length=30, unique=True)
  slug = models.SlugField('slug', max_length=40)

  def __str__(self):
    """Return the tag name."""
    return self.name

  def save(self, *args, **kwargs):
    """Derive ``slug`` from ``name`` for unsaved or slug-less rows, then save."""
    if not self.id or not self.slug:
      self.slug = slugify(self.name)
    super().save(*args, **kwargs)

  class Meta:
    ordering = ['name']
    verbose_name = "产品标签"
    verbose_name_plural = verbose_name


class Product(models.Model):
  """A product with a unique title, an image, a view counter, a category and tags."""
  title = models.CharField('标题', max_length=255, unique=True)
  slug = models.SlugField('slug', max_length=255, blank=True, null=True)
  # Technical parameters, stored as free text.
  jscs = models.TextField('技术参数', blank=True, null=True)
  image = models.ImageField(upload_to=products_directory_path, verbose_name="产品图片")
  views = models.PositiveIntegerField('浏览量', default=0)
  category = models.ForeignKey('ProductsCategory', verbose_name='分类', on_delete=models.CASCADE, blank=True, null=True)
  tags = models.ManyToManyField('ProductsTag', verbose_name='标签集合', blank=True)

  def save(self, *args, **kwargs):
    """Derive ``slug`` from ``title`` for unsaved or slug-less rows, then save."""
    if not self.id or not self.slug:
      self.slug = slugify(self.title)
    super().save(*args, **kwargs)

  def update_views(self):
    """Increase the view counter by one and persist only that column."""
    # Let the database perform the addition so that concurrent requests
    # cannot overwrite each other's increments.
    self.views = models.F('views') + 1
    self.save(update_fields=['views'])
    # Swap the F() expression on this instance for the stored integer.
    self.refresh_from_db(fields=['views'])

  def get_pre(self):
    """Return the product with the closest smaller id, or None."""
    return Product.objects.filter(id__lt=self.id).order_by('-id').first()

  def get_next(self):
    """Return the product with the closest larger id, or None."""
    return Product.objects.filter(id__gt=self.id).order_by('id').first()

  def __str__(self):
    """Return the product title."""
    return self.title

  class Meta:
    verbose_name = "产品"
    verbose_name_plural = verbose_name


class ProductAdvantage(models.Model):
  """One advantage text line belonging to a product."""
  content = models.TextField('产品优势', blank=True, null=True)
  product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

  def __str__(self):
    """Return the advantage text, or an empty string when it is NULL."""
    # content is nullable; __str__ must always return a str.
    return self.content or ''

  class Meta:
    verbose_name = "产品优势"
    verbose_name_plural = verbose_name


class ProductBody(models.Model):
  """One line of descriptive content belonging to a product."""
  body = models.CharField('产品内容', max_length=256, blank=True, null=True)
  product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

  def __str__(self):
    """Return the owning product's title, falling back to the body text."""
    # product is nullable, so guard before dereferencing it.
    if self.product_id is None:
      return self.body or ''
    return self.product.title

  class Meta:
    verbose_name = "产品内容"
    verbose_name_plural = verbose_name

2.脚本编写

2.1编写获取网页源代码函数

def get_one_page(url):
  """Download ``url`` and return its body decoded as UTF-8.

  Args:
    url: Address of the page to fetch.

  Returns:
    The page text when the server answers with HTTP 200; ``None`` for any
    other status code or when the request itself fails.
  """
  headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
  try:
    # A timeout keeps the crawl from hanging forever on a stalled server.
    res = requests.get(url=url, headers=headers, timeout=10)
  except requests.RequestException:
    return None
  if res.status_code != 200:
    return None
  res.encoding = 'utf-8'
  return res.text

2.2根据base页面获取所有产品分类页面链接

if __name__ == '__main__':
  content = get_one_page(url)
  # get_one_page returns None on failure; stop instead of parsing None.
  if content is None:
    raise SystemExit('出错url:' + url)
  tree = etree.HTML(content)
  # Links to the product-category pages.
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  # The hrefs are prefixed with the site root; a separate loop name keeps
  # the start ``url`` intact.
  for catgory_href in catgory_urls:
    print('http://www.kexinjianji.com' + catgory_href)

2.3根据产品分类页面链接获取对应所有产品链接

if __name__ == '__main__':
  content = get_one_page(url)
  # get_one_page returns None on failure; stop instead of parsing None.
  if content is None:
    raise SystemExit('出错url:' + url)
  tree = etree.HTML(content)
  # Category name shown on the page; the XPath may match nothing.
  catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
  if catgory:
    print("产品分类:" + catgory[0])
  # Links to the products listed under this category.
  urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
  # The hrefs are prefixed with the site root; a separate loop name keeps
  # the start ``url`` intact.
  for product_href in urls:
    print('http://www.kexinjianji.com' + product_href)
  print("=====================================================")

两者结合起来就可以打印出所有产品链接

if __name__ == '__main__':
  base_url = 'http://www.kexinjianji.com'
  content = get_one_page(url)
  # get_one_page returns None on failure; stop instead of parsing None.
  if content is None:
    raise SystemExit('出错url:' + url)
  tree = etree.HTML(content)
  # Links to the product-category pages.
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  for catgory_href in catgory_urls:
    catgory_url = base_url + catgory_href
    content = get_one_page(catgory_url)
    # Skip a category whose page could not be downloaded.
    if content is None:
      print('出错url:' + catgory_url)
      continue
    tree = etree.HTML(content)
    # Category name shown on the page; the XPath may match nothing.
    catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    if catgory:
      print("产品分类:" + catgory[0])
    # Links to the products listed under this category.
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    for product_href in urls:
      print(base_url + product_href)
    print("=====================================================")

2.4使用xpath解析函数返回产品链接的内容

if __name__ == '__main__':
  content = get_one_page(url)
  # get_one_page returns None on failure; stop instead of parsing None.
  if content is None:
    raise SystemExit('出错url:' + url)
  tree = etree.HTML(content)
  # Product name.
  title = tree.xpath('//*[@id="wrap"]//h1/text()')
  # Product image; the src is appended directly to the site root, as the
  # combined scripts do (no extra slash).
  images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
  # Performance features: text of the divs and of their spans.
  xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
  # Technical parameters: the first table, serialised back to HTML.
  # A page may contain no table at all, so guard the lookup.
  tables = tree.xpath('//table')
  jscs_str = etree.tostring(tables[0], encoding='utf-8').decode('utf-8') if tables else ''
  # Product content paragraphs.
  cpnr = tree.xpath('//div[@class="describe"]/p')
  if title:
    print('产品名称:' + title[0])
  if images:
    print('产品图片:' + 'http://www.kexinjianji.com' + images[0])
  for td in xntd:
    print('性能特点:' + td)
  print('技术参数:' + jscs_str)
  for paragraph in cpnr:
    # string(.) concatenates all text below the current element.
    print('产品内容:' + paragraph.xpath('string(.)'))
  print('============================================')

将三者结合在一起就可以获取所有产品信息

if __name__ == '__main__':
  base_url = 'http://www.kexinjianji.com'
  content = get_one_page(url)
  # get_one_page returns None on failure; stop instead of parsing None.
  if content is None:
    raise SystemExit('出错url:' + url)
  tree = etree.HTML(content)
  # Links to the product-category pages.
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  for catgory_href in catgory_urls:
    catgory_url = base_url + catgory_href
    content = get_one_page(catgory_url)
    # Skip a category whose page could not be downloaded.
    if content is None:
      print('出错url:' + catgory_url)
      continue
    tree = etree.HTML(content)
    # Category name shown on the page.
    catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    # Links to the products listed under this category.
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    for product_href in urls:
      product_url = base_url + product_href
      content = get_one_page(product_url)
      # Any missing element (or a failed download) raises inside the try and
      # is reported with the offending URL so it can be handled by hand.
      try:
        tree = etree.HTML(content)
        # Product name.
        title = tree.xpath('//*[@id="wrap"]//h1/text()')
        # Product image.
        images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
        images_url = base_url + images[0]
        # Performance features: text of the divs and of their spans.
        xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
        # Technical parameters: the first table, serialised back to HTML.
        # Not printed here, but the lookup still flags pages without a table.
        jscs = tree.xpath('//table')[0]
        jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
        # Product content paragraphs.
        cpnr = tree.xpath('//div[@class="describe"]/p')
        print("产品分类:" + catgory[0])
        print('产品链接:' + product_url)
        print('产品名称:' + title[0])
        print('产品图片:' + images_url)
        for td in xntd:
          print('性能特点:' + td.strip())
        for paragraph in cpnr:
          # string(.) concatenates all text below the current element.
          print('产品内容:' + paragraph.xpath('string(.)'))
        print('============================================')
      except Exception as e:
        print(e)
        print('出错url:' + product_url)

3.存储到django模型

import requests
from lxml.html import etree
import os
import django
import uuid
from django.core.files.base import ContentFile

# Point Django at the project settings and initialise it; the model classes
# below can only be imported once django.setup() has run.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()

from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage

# Start page of the crawl.
url = 'http://www.kexinjianji.com/product/hzshntjbz_1/'


def get_one_page(url):
  """Download ``url`` and return its body decoded as UTF-8.

  Args:
    url: Address of the page to fetch.

  Returns:
    The page text when the server answers with HTTP 200; ``None`` for any
    other status code or when the request itself fails (the failure is
    printed).
  """
  headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
  try:
    res = requests.get(url=url, headers=headers, timeout=10)
  except requests.RequestException as exc:
    # Report which request failed and why, instead of a placeholder message.
    print('请求失败: {} ({})'.format(url, exc))
    return None
  if res.status_code != 200:
    return None
  res.encoding = 'utf-8'
  return res.text


if __name__ == '__main__':
  base_url = 'http://www.kexinjianji.com'
  content = get_one_page(url)
  # get_one_page returns None on failure; stop instead of parsing None.
  if content is None:
    raise SystemExit('出错url:' + url)
  tree = etree.HTML(content)
  # Links to the product-category pages.
  catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  for catgory_href in catgory_urls:
    catgory_url = base_url + catgory_href
    content = get_one_page(catgory_url)
    # Skip a category whose page could not be downloaded.
    if content is None:
      print('出错url:' + catgory_url)
      continue
    tree = etree.HTML(content)
    # Category name shown on the page.
    p_catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    # Links to the products listed under this category.
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    for product_href in urls:
      product_url = base_url + product_href
      content = get_one_page(product_url)
      # Any missing element, failed download or database error is reported
      # with the offending URL so it can be handled by hand afterwards.
      try:
        tree = etree.HTML(content)
        # Product name.
        title = tree.xpath('//*[@id="wrap"]//h1/text()')
        # Product image.
        images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
        images_url = base_url + images[0]
        # Performance features: text of the divs and of their spans.
        xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
        # Technical parameters: the first table, serialised back to HTML.
        jscs = tree.xpath('//table')[0]
        jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
        # Product content paragraphs.
        cpnr = tree.xpath('//div[@class="describe"]/p')

        # Fetch the category, creating it on first sight.
        catgory = p_catgory[0]
        products_catgory, _created = ProductsCategory.objects.get_or_create(name=catgory)
        print(products_catgory)

        # Download the product image; fail loudly on an HTTP error rather
        # than storing an error page as the image.
        image_content = requests.get(url=images_url, timeout=10)
        image_content.raise_for_status()
        ext = images_url.split('.')[-1]  # image type
        filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)  # random file name
        # Wrap the bytes in a Django file object so ImageField can store them.
        upload_image_file = ContentFile(image_content.content, name=filename)
        product = Product(title=title[0], jscs=jscs_str, image=upload_image_file, category=products_catgory)
        product.save()

        for td in xntd:
          # The XPath also yields whitespace-only text nodes; do not store them.
          advantage_text = td.strip()
          if not advantage_text:
            continue
          ProductAdvantage(content=advantage_text, product=product).save()
        for paragraph in cpnr:
          # string(.) concatenates all text below the current element.
          ProductBody(body=paragraph.xpath('string(.)'), product=product).save()
      except Exception as e:
        print(e)
        print('出错url:' + product_url)

最后自己手动处理出错url(页面没有获取到技术参数,技术参数是一张图片)

4.总结

1.xpath 获取标签内容时,p标签中嵌套span标签,源码如下

<div class=\"describe\" style=\"position: relative;\"> 
   <p><span>板  宽:</span>1500mm</p> 
   <p><span>板  厚:</span>4.5 mm</p> 
   <p><span>出料口:</span>6口</p> 
   <p><span>重  量:</span>6000 kg</p>
</div>

使用xpath获取p标签内容
我想得到的效果如下
板 宽:1500mm
板 厚:4.5 mm
出料口:6口
重 量:6000 kg
使用以下xpath 只能分开获取,不是想要的效果

//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()

百度之后找到的解决办法,使用xpath('string(.)')
1.先获取所有p标签

cpnr = tree.xpath('//div[@class="describe"]/p')

2.使用**string(.)**获取所有标签所有文本

cp = cp.xpath('string(.)')

循环遍历所有p标签即可

© 版权声明
THE END
喜欢就支持一下吧
点赞0 分享
评论 抢沙发

请登录后发表评论

    暂无评论内容