1)lxml的使用方法
from lxml import etree

# NOTE(review): the HTML tags of this sample page were stripped when these
# notes were extracted; only the text content survived. The markup below is
# a minimal well-formed reconstruction around that text. Element names, ids
# and classes are illustrative only -- confirm against the original tutorial.
my_page = '''<html>
<head><title>Title</title></head>
<body>
<h1>我的⽂章</h1>
<ul>
<li>北京</li>
<li>上海</li>
<li>深圳</li>
<li>武汉</li>
</ul>
<div id="site">我的⽹站</div>
<div id="articles">我的⽂章</div>
<div class="more">更多详情</div>
<div class="tags">Python Python⼩⽩ Python进阶本站总访问量159323次</div>
<div class="foot">Copyright © 2019 - 2021 程序员zhenguo</div>
</body>
</html>'''

# etree.fromstring requires well-formed XML, which the markup above is.
html = etree.fromstring(my_page)

# Part 1: locating elements
divs1 = html.xpath('//div')       # every <div> anywhere in the document
divs2 = html.xpath('//div[@id]')  # only <div> elements that have an id attribute
# NOTE(review): the notes read '//div[@]', which is not a valid XPath
# expression. '[@*]' (a <div> with any attribute at all) is the most likely
# original, the '*' presumably having been lost in extraction -- confirm.
divs3 = html.xpath('//div[@*]')

# 2)音乐爬取案例 -- Section 2: music download crawler example
import os
import random
import re
import time
from urllib.parse import urljoin

import lxml.etree
import requests
from lxml import etree

# Listing pages look like https://www.tuke88.com/peiyue/zonghe_0_1.html
LIST_URL_TEMPLATE = 'https://www.tuke88.com/peiyue/zonghe_0_{page}.html'
SAVE_DIR = 'pymp3'
REQUEST_TIMEOUT = 15  # seconds; without a timeout a stalled server hangs the script


def safe_filename(name):
    """Return *name* with characters that are illegal in file names removed.

    Track titles come straight from the web page, so they may contain path
    separators or other characters the file system rejects. Falls back to
    'untitled' when nothing usable is left.
    """
    return re.sub(r'[\\/:*?"<>|\r\n\t]', '', name).strip() or 'untitled'


page_n = int(input('请输入你想要爬取的网页数量: '))

# Create the output directory once, up front, instead of checking on every page.
os.makedirs(SAVE_DIR, exist_ok=True)

# NOTE(review): the sample URL above ends in "_1", so listing pages are
# assumed to be numbered from 1 -- confirm against the site.
for i in range(1, page_n + 1):
    url = LIST_URL_TEMPLATE.format(page=i)
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(f'[error]{url}: {exc}')
        continue

    # Step 3: use lxml to extract the parts of the HTML page we want.
    html_parser = lxml.etree.HTMLParser()
    html = lxml.etree.fromstring(res.text, parser=html_parser)
    titles = html.xpath("//div[@class='lmt']//div[@class='audio-list']//a[@class='title']/text()")
    print(titles)
    mp3_urls = html.xpath("//div[@class='lmt']//div[@class='audio-list']//source/@src")
    print(mp3_urls)

    for title, mp3_url in zip(titles, mp3_urls):
        target = os.path.join(SAVE_DIR, safe_filename(title) + ".mp3")
        # urljoin leaves absolute URLs untouched and resolves relative or
        # protocol-relative ("//host/...") src values against the page URL.
        download_url = urljoin(url, mp3_url)
        try:
            with requests.get(download_url, stream=True, timeout=REQUEST_TIMEOUT) as mp3_stream:
                mp3_stream.raise_for_status()
                with open(target, "wb") as f:
                    # Stream in chunks rather than buffering the whole file;
                    # iter_content also undoes any gzip/deflate transfer
                    # encoding, which reading .raw directly would not.
                    for chunk in mp3_stream.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)
        except (requests.RequestException, OSError) as exc:
            # One failed track should not abort the remaining downloads.
            print(f'[error]{title}.mp3: {exc}')
            continue
        print(f'[info]{title}.mp3下载成功')
        # Small random pause between downloads to avoid hammering the server.
        time.sleep(random.uniform(0.1, 0.4))

# Notes:
# - r.content is the processed body (e.g. gzip/deflate are decoded
#   automatically); r.raw is the raw data as returned by the socket.
# - deflate [dɪˈfleɪt]: to compress.
# - https://developer.mozilla.org/zh-CN/docs/Web -- site for learning JavaScript
# - https://dabblet.com/ -- online JavaScript playground
3)爬取c-log博客数据
import os
import random
import re
import time
from html import escape

import requests
from lxml import etree
import pdfkit

author_name = input("请输入博主ID: ")
MAX_PAGE = 200
PAGE_SIZE = 20
REQUEST_TIMEOUT = 15  # seconds
SAVE_DIR = "c_articles"
# Article-list endpoint, e.g.
# https://blog.csdn.net/community/home-api/v1/get-business-list?page=2&size=20&businessType=blog&orderby=&noMore=false&year=&month=&username=weixin_46274168
LIST_API = 'https://blog.csdn.net/community/home-api/v1/get-business-list'
i = 1  # running number of the next article to be saved
sess = requests.Session()
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0'
sess.headers['User-Agent'] = agent


def crawler_blog_by(author_name, article_id, title, i):
    """Download one article and save its body as a standalone HTML file.

    Args:
        author_name: blog owner's id, used in the article URL and file name.
        article_id: id of the article to fetch.
        title: article title, used (sanitised) in the file name.
        i: running number of this article, used only in the log message.

    Returns:
        True when the article was saved, False when the page contained no
        ``content_views`` element to extract.

    Raises:
        requests.RequestException: when the article page cannot be fetched.
        OSError: when the output file cannot be written.
    """
    # https://blog.csdn.net/{author_name}/article/details/{article_id}?spm=1001.2100.3001.7377
    article_request_url = f'https://blog.csdn.net/{author_name}/article/details/{article_id}?spm=1001.2100.3001.7377'
    response = sess.get(article_request_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    selector = etree.HTML(response.text)

    body_nodes = selector.xpath(r"//div[@id='content_views']")
    if not body_nodes:
        # Deleted, private or blocked article: report it and let the caller move on.
        print(f'[warn] : {author_name} {article_id} has no content_views element, skipped')
        return False
    body_str = etree.tostring(body_nodes[0], encoding='utf8', method='html').decode()

    os.makedirs(SAVE_DIR, exist_ok=True)
    # "/" becomes "-"; other characters that are illegal in file names are dropped.
    title = re.sub(r'[\\:*?"<>|\r\n\t]', '', title.replace("/", "-"))
    save_file_name = os.path.join(SAVE_DIR, f'{author_name}-{title}-{article_id}.html')
    with open(save_file_name, 'w', encoding='utf8') as f:
        # Wrap the fragment in a minimal document that declares its encoding,
        # so browsers do not have to guess the charset of the saved text.
        f.write(f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{escape(title)}</title>
</head>
<body>
{body_str}
</body>
</html>""")
    print(f'[info] : {author_name}第{i}篇{title}-{article_id}.html保存成功')
    return True


# Walk the paginated article list.
# NOTE(review): the API is assumed to number pages from 1 -- confirm.
for page_no in range(1, MAX_PAGE + 1):
    data = {"page": page_no, "size": PAGE_SIZE, "businessType": "blog", "orderby": "",
            "noMore": "false", "year": "", "month": "", "username": author_name}
    try:
        # Send the query built above so each iteration requests its own page
        # for the requested author.
        list_response = sess.get(LIST_API, params=data, timeout=REQUEST_TIMEOUT)
        list_response.raise_for_status()
        articles = list_response.json()['data']['list']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        print(e)
        continue

    if not articles:
        # An empty page means we have walked past the author's last article.
        break

    for article in articles:
        try:
            saved = crawler_blog_by(author_name, article['articleId'], article['title'], i)
        except (requests.RequestException, OSError, KeyError) as e:
            # One bad article should not abort the rest of the page.
            print(e)
            continue
        if saved:
            # The counter lives here: rebinding it inside the function would
            # only change the function's local copy.
            i += 1
        time.sleep(random.uniform(0.4, 1.0))
4)密钥学习
import hashlib
import os

# 1) os.path.splitext returns two values: the path without its extension
#    (directories included) and the extension itself.
path = '/usr/local/bin/python.exe'
filename, ext = os.path.splitext(path)
print('文件名:', filename)
print('扩展名:', ext)
# 文件名: /usr/local/bin/python
# 扩展名: .exe

# Reference: https://www.liaoxuefeng.com/ -- Liao Xuefeng's tutorial site


# MD5 digest (a one-way hash rather than reversible encryption).
def getMd5(data):
    """Return the hexadecimal MD5 digest of the string *data* (UTF-8 encoded)."""
    return hashlib.md5(data.encode('utf-8')).hexdigest()


print(getMd5('zhen guo'))

# Stripping whitespace or specific characters from both ends of a string.
# The variable is deliberately not called ``str``: that would shadow the
# builtin and break any later ``str(...)`` call.
padded_text = "00000003210Runoob01230000000"
# Strip the character 0 from both ends.
str1 = padded_text.strip('0')
print(str1)
# Strip leading and trailing whitespace.
str2 = " Runoob "
str3 = str2.strip()
print(str3)


# RSA key generation. Requires: pip install pycryptodome
def generate_rsa_key_files(public_path='public_a.rsa', private_path='private_a.rsa'):
    """Generate a 2048-bit RSA key pair and write both keys to disk.

    The pycryptodome imports are local so that the rest of this snippet can
    be imported without the package installed.

    Returns:
        A ``(public_key, private_key)`` tuple of the exported key bytes.
    """
    from Crypto import Random
    from Crypto.PublicKey import RSA
    from Crypto.Cipher import PKCS1_v1_5 as PKCS1_cipher  # noqa: F401 (used by the follow-up example)

    random_generator = Random.new().read
    rsa = RSA.generate(2048, random_generator)

    # Export and save the public key.
    public_key = rsa.publickey().exportKey()
    with open(public_path, 'wb') as f:
        f.write(public_key)
    print(public_key)

    # Export and save the private key.
    private_key = rsa.exportKey()
    with open(private_path, 'wb') as f:
        f.write(private_key)
    print(private_key)
    return public_key, private_key


if __name__ == '__main__':
    generate_rsa_key_files()
5)rsa加密解密案例
cipher [ˈsaɪfə] 密码的意思
from Crypto import Random
from Crypto.PublicKey import RSA
from Crypto.Cipher import PKCS1_v1_5 as PKCS1_cipher

# PKCS#1 v1.5 padding occupies at least 11 bytes of every encrypted block.
PKCS1_V1_5_OVERHEAD = 11

random_generator = Random.new().read
rsa = RSA.generate(2048, random_generator)

# Export and save the public key.
public_key = rsa.publickey().exportKey()
with open('public_a.rsa', 'wb') as f:
    f.write(public_key)
print(public_key)

# Export and save the private key.
private_key = rsa.exportKey()
with open('private_a.rsa', 'wb') as f:
    f.write(private_key)
print(private_key)

data = input('请输入待加密的文本: ')

# Read the key back as bytes, exactly as it was written; importKey accepts
# bytes directly, so no text-mode round trip or str() wrapper is needed.
with open('public_a.rsa', 'rb') as f:
    pub_key = RSA.importKey(f.read())
encryptor = PKCS1_cipher.new(pub_key)

# The input is text and must be converted to bytes before encryption.
plain_bytes = data.encode('utf8')
max_plain_len = pub_key.size_in_bytes() - PKCS1_V1_5_OVERHEAD
if len(plain_bytes) > max_plain_len:
    # encrypt() would raise a bare ValueError; fail with a clear message instead.
    raise SystemExit(f'待加密的文本过长: {len(plain_bytes)} 字节, 最多 {max_plain_len} 字节')
rsa_text = encryptor.encrypt(plain_bytes)

# The ciphertext is sent to the client; the client holds both the public and
# the private key.
with open('private_a.rsa', 'rb') as f:
    pri_key = RSA.importKey(f.read())
decryptor = PKCS1_cipher.new(pri_key)

# decrypt() returns the sentinel object instead of raising when the padding
# is invalid, so the result must be checked before it is decoded.
DECRYPT_FAILED = None
raw_data = decryptor.decrypt(rsa_text, DECRYPT_FAILED)
if raw_data is DECRYPT_FAILED:
    raise SystemExit('解密失败')
print(f"加密后的数据{rsa_text},解密后等于:{raw_data.decode('utf8')}")
6)enumerate和zip内置函数学习
# How to use enumerate
s = [1, 2, 3, 4, 5]
e = enumerate(s)
# Printing the enumerate object shows only its repr; it is a lazy iterator.
print(e)
# Iterating is what actually produces the (index, value) pairs.
for index, value in e:
    print(f'{index}, {value}')

# String slicing
title = 'adfefnfnnf;nfnnfnefn'
title1 = title[:4]  # first four characters -> adfe
print(title1)
title2 = title[:-5]  # everything except the last five characters -> adfefnfnnf;nfnn
print(title2)

# What is an object id?
# The object ID is a value that uniquely identifies an object (its identity,
# like a label).
# range is used to generate a sequence of integers.

# Using zip: iteration stops at the end of the shorter input, so the last
# element of ``a`` (67) is never paired.
a = [3, 2, 1, 5, 67]
b = [1, 6, 3, 90]
for i, j in zip(a, b):
    print(f'i ={i},j={j}')
猜你喜欢
- 18天前(零碳中国·绿色投资蓝皮书)中国"零碳"差旅之路暨"绿色低碳酒店"标准研究项目成果发布会召开
- 18天前(东北地区全域旅游)东北三省一区宣传贯彻研学旅游行业标准
- 18天前(中国最好的避暑山庄)2025中国十大避暑山庄评选揭晓,澳涞山庄夺魁
- 18天前(武隆旅游门票)炸了!519中国旅游日武隆甩出王炸福利,59.9元通玩6大景点?!
- 18天前(夏日纵享 邂逅双面姑苏是哪一集)夏日纵享 邂逅双面姑苏
- 18天前(岭南东方大酒店)粤西成势 | 阳江阳春长兴岭南东方酒店正式签约,粤西文旅再添明珠
- 18天前(上海迪士尼 夏天)酷爽夏日,奇妙相伴!来上海迪士尼度假区清凉入夏
- 18天前(世茂海峡大厦多高)巴西地产高管齐聚厦门世茂海峡大厦 共探超高层建筑锻造经验
- 18天前(内蒙古交通旅游图)内蒙古着力提升交通与旅游服务水平
- 18天前(殷建祥简历)全国十大牛商解码:殷建祥如何用178天技术突围打造星空梦星空房
网友评论
- 搜索
- 最新文章
- (2020广州车展哈弗)你的猛龙 独一无二 哈弗猛龙广州车展闪耀登场
- (哈弗新能源suv2019款)智能科技颠覆出行体验 哈弗重塑新能源越野SUV价值认知
- (2021款全新哈弗h5自动四驱报价)新哈弗H5再赴保障之旅,无惧冰雪护航哈弗全民电四驱挑战赛
- (海南航空现况怎样)用一场直播找到市场扩张新渠道,海南航空做对了什么?
- (visa jcb 日本)优惠面面俱到 JCB信用卡邀您畅玩日本冰雪季
- (第三届“堡里有年味·回村过大年”民俗花灯会活动)第三届“堡里有年味·回村过大年”民俗花灯会活动
- (展示非遗魅力 长安启源助力铜梁龙舞出征)展示非遗魅力 长安启源助力铜梁龙舞出征
- (阿斯塔纳航空公司)阿斯塔纳航空机队飞机数量增至50架
- (北京香港航班动态查询)香港快运航空北京大兴新航线今日首航
- (我在港航“呵护”飞机 每一次安全着陆就是最好的荣誉)我在港航“呵护”飞机 每一次安全着陆就是最好的荣誉
- 热门文章