Python web crawler - 1

Build a program that scrapes every job and company record from the 104人力銀行 (104 Job Bank) freshman job-search section.

First, fetch the information on the search-results page.


import requests
from bs4 import BeautifulSoup as bs

search_url = "https://www.104.com.tw/area/freshman/search?keyword=%E6%A9%9F%E5%99%A8%E4%BA%BA&area=6001001000,6001002000&jobcategory=2007000000&industry=&page=1&sortField=APPEAR_DATE&sortMode=DESC"
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Accept-Language': 'zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4'}
res = requests.get(search_url, headers=head)
print(res.encoding)
res.encoding = 'utf8'
soup = bs(res.text, 'html.parser')

Out[1]: ISO-8859-1
requests fetches and returns the page content.
res.encoding tells you which encoding Python assigned to the page. The site declares utf-8, which matched my Python settings, so at first I assumed there was no problem. Only later did I learn how to check the detected encoding: printing it showed that Python had decided on ISO-8859-1, so I set it to utf-8 myself.
BeautifulSoup is a very handy tool for parsing and navigating HTML: convert the response to text with res.text and hand it to BeautifulSoup.
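
Incidentally, requests can also guess the real encoding from the response bytes via res.apparent_encoding (it runs charset detection under the hood), so the mismatch does not have to be discovered by hand; a minimal sketch:

import requests

res = requests.get("https://www.104.com.tw")
print(res.encoding)            # encoding taken from the HTTP headers (this is what came back as ISO-8859-1)
print(res.apparent_encoding)   # encoding guessed from the response body
res.encoding = res.apparent_encoding   # adopt the guess before reading res.text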

Scraping one job entry


Use select to pull out the region of the page that holds the information we want:

job_box = soup.select(".job_box")[0]
joblist = job_box.select('.joblist_cont')[0]

job_name = joblist.select('a')[0]['title']    # job name
com_name = joblist.select('a')[1]['title']    # company name
edu = joblist.select(".edu")[0].text.replace('\t','').replace('\r\n','')   # education requirement
area = joblist.select(".area")[0].text.replace('\t','').replace('\r\n','') # work location (district)
job_url = "https://www.104.com.tw" + joblist.select('a')[0]['href']        # job posting page
com_url = "https://www.104.com.tw" + joblist.select('a')[1]['href']        # company profile page

Open the job posting page and the company profile page to fetch their contents:
def load_web(url):
    web_res = requests.get(url)
    web_res.encoding = 'utf8'
    time.sleep(random.uniform(0, 2))   # random delay between requests to avoid hammering the server
    web_bs = bs(web_res.text, 'html.parser')
    return web_bs

job_web_bs = load_web(job_url)
com_web_bs = load_web(com_url)
   
# number of people applying for the job
num = num_of_people(job_web_bs)
# job information
job_info = load_job_info(job_web_bs)
# company information
com_info = load_com_info(com_web_bs)

(num_of_people, load_job_info, and load_com_info are defined below.)
Store the data we want in nested dictionaries (hashes):
job_info = {}                # create an empty dict
title1 = '工作內容'
title2 = '工作說明'
explanation = 'xxxxxxx'
if title1 not in job_info.keys():           # create the '工作內容' key if it does not exist yet
    job_info[title1] = {}
if title2 not in job_info[title1].keys():   # create the '工作說明' key if it does not exist yet
    job_info[title1][title2] = {}
job_info[title1][title2] = explanation      # store 'xxxxxxx' under job_info['工作內容']['工作說明']
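
As a side note, dict.setdefault does the existence check and the insertion in one step, so the same structure can be built without the if statements; a minimal equivalent sketch:

job_info = {}
# setdefault returns the existing inner dict, or inserts and returns a new one
job_info.setdefault('工作內容', {})['工作說明'] = 'xxxxxxx'
print(job_info)   # {'工作內容': {'工作說明': 'xxxxxxx'}}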

Example: the job posting page
# job info
def load_job_info(job_web_bs):
    job_info = {}
    main_info = job_web_bs.select("section")[0]
    title1 = main_info.select('h2')[0].text      # 工作內容 (job description)
    title2 = '工作說明'
    explanation = main_info.select('p')[0].text.replace('\r', ' ')
    
    if title1 not in job_info.keys():
        job_info[title1] = {}
    if title2 not in job_info[title1].keys():
        job_info[title1][title2] = {}
    job_info[title1][title2] = explanation
            
    for i in range(len(main_info.select('dt'))):
        title2 = main_info.select('dt')[i].text.replace('\r','').replace('\t','').replace(':','')
        explanation = main_info.select('dd')[i].text.replace(' ','').replace('\n','')                       
        if title2 not in job_info[title1].keys():
            job_info[title1][title2] = {}
        job_info[title1][title2] = explanation
               
    # job condition
    condition = job_web_bs.select("section")[1]
    title1 = condition.select('h2')[0].text      # 工作條件 (job requirements)
    if title1 not in job_info.keys():
        job_info[title1] = {}
        
    for i in range(len(condition.select('dt'))):
        title2 = condition.select('dt')[i].text.replace(':','')
        explanation = condition.select('dd')[i].text.replace(' ','')  
        if title2 not in job_info[title1].keys():
            job_info[title1][title2] = {}
        job_info[title1][title2] = explanation
        
    # work welfare
    welfare = job_web_bs.select("section")[2]
    title1 = welfare.select('h2')[0].text        # 公司福利 (company benefits)
    explanation = welfare.select('p')[0].text.replace('\r\u3000','\t').replace('\r','')
    if title1 not in job_info.keys():
        job_info[title1] = {}
    job_info[title1] = explanation
    return job_info
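
Side note: since every <dt> label pairs with a <dd> value, zip can replace the index-based loops above; a sketch reusing the same variables and page-structure assumptions:

# inside load_job_info: build the title -> explanation mapping for one section in a single pass
condition = job_web_bs.select("section")[1]
titles = [dt.text.replace(':', '').strip() for dt in condition.select('dt')]
values = [dd.text.replace(' ', '') for dd in condition.select('dd')]
job_info[condition.select('h2')[0].text] = dict(zip(titles, values))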




Formatting problems when writing the file:

A web search told me Excel does not use utf-8 encoding, and Python also raised an error while writing the file: 'cp950' codec can't encode character '\u30fb' in position 0: illegal multibyte sequence. So every output line is test-encoded through cp950 first, and any character that cannot be encoded is replaced with a space.
Since Python strings are immutable, convert the string to a list first, overwrite the offending character, then join it back into a string.

wl = '"工作內容","工作說明","xxxx"'
for j in range(len(wl)):
    try:
        wl[j].encode('cp950').decode('cp950')
    except:
        tmp = list(wl)
        tmp[j] = " "
        wl = ''.join(tmp)
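
For reference, open() can perform this substitution by itself with errors='replace' (unencodable characters become '?'), and the csv module takes care of the quoting that writeline does by hand; a minimal sketch with placeholder field values:

import csv

with open('./test.csv', 'w', encoding='cp950', errors='replace', newline='') as fw:
    writer = csv.writer(fw, quoting=csv.QUOTE_ALL)   # quote every field, like writeline does
    writer.writerow(['工作內容', '工作說明', 'xxxx'])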


Full code:

import requests
from bs4 import BeautifulSoup as bs
import time
import random

# fetch a page and return the parsed BeautifulSoup tree
def load_web(url):
    web_res = requests.get(url)
    web_res.encoding = 'utf8'
    time.sleep(random.uniform(0, 2))   # random delay so we don't hammer the server
    web_bs = bs(web_res.text, 'html.parser')
    return web_bs

# join a list of fields into one CSV line, each field wrapped in double quotes
def writeline(array):
    wl = '"' + array[0] + '"'
    for i in range(1, len(array)):
        wl = wl + ',"' + array[i] + '"'
    return wl

# read the total number of result pages from the pager widget
def get_all_page(beautifulsoup_text):
    page = beautifulsoup_text.select('.next_page')[0]
    tt = page.text.replace('\t', '').split('\n')
    num = tt[1].find('共')
    allpage = int(tt[1][num + 2])   # note: assumes a single-digit page count ("共 N 頁")
    return allpage

class job_class(object):
    def __init__(self, joblist):
        self.job_name = joblist.select('a')[0]['title']   # job name
        self.com_name = joblist.select('a')[1]['title']   # company name
        self.edu = joblist.select(".edu")[0].text.replace('\t', '').replace('\r\n', '')
        self.area = joblist.select(".area")[0].text.replace('\t', '').replace('\r\n', '')
        self.job_url = "https://www.104.com.tw" + joblist.select('a')[0]['href']
        self.com_url = "https://www.104.com.tw" + joblist.select('a')[1]['href']

    # job info
    @staticmethod
    def load_job_info(job_web_bs):
        job_info = {}
        main_info = job_web_bs.select("section")[0]
        title1 = main_info.select('h2')[0].text   # 工作內容 (job description)
        title2 = '工作說明'
        explanation = main_info.select('p')[0].text.replace('\r', ' ')
        if title1 not in job_info.keys():
            job_info[title1] = {}
        if title2 not in job_info[title1].keys():
            job_info[title1][title2] = {}
        job_info[title1][title2] = explanation
        for i in range(len(main_info.select('dt'))):
            title2 = main_info.select('dt')[i].text.replace('\r', '').replace('\t', '').replace(':', '')
            explanation = main_info.select('dd')[i].text.replace(' ', '').replace('\n', '')
            if title2 not in job_info[title1].keys():
                job_info[title1][title2] = {}
            job_info[title1][title2] = explanation
        # job condition
        condition = job_web_bs.select("section")[1]
        title1 = condition.select('h2')[0].text   # 工作條件 (job requirements)
        if title1 not in job_info.keys():
            job_info[title1] = {}
        for i in range(len(condition.select('dt'))):
            title2 = condition.select('dt')[i].text.replace(':', '')
            explanation = condition.select('dd')[i].text.replace(' ', '')
            if title2 not in job_info[title1].keys():
                job_info[title1][title2] = {}
            job_info[title1][title2] = explanation
        # work welfare
        welfare = job_web_bs.select("section")[2]
        title1 = welfare.select('h2')[0].text     # 公司福利 (company benefits)
        explanation = welfare.select('p')[0].text.replace('\r\u3000', '\t').replace('\r', '')
        if title1 not in job_info.keys():
            job_info[title1] = {}
        job_info[title1] = explanation
        return job_info

    # company info
    @staticmethod
    def load_com_info(com_web_bs):
        com_info = {}
        com_info_bs = com_web_bs.select(".intro")[0]
        title1 = com_info_bs.select('h2')[0].text   # 公司介紹 (company introduction)
        if title1 not in com_info.keys():
            com_info[title1] = {}
        for i in range(len(com_info_bs.select('dt'))):
            title2 = com_info_bs.select('dt')[i].text.replace('\u3000', '').replace(':', '')
            explanation = com_info_bs.select('dd')[i].text.replace('\r', '').replace('\n', '')
            if title2 not in com_info[title1].keys():
                com_info[title1][title2] = {}
            com_info[title1][title2] = explanation
        com_info_bs = com_web_bs.select(".intro")[1]
        title1 = com_info_bs.select('h2')[0].text   # 公司簡介 (company profile)
        explanation = com_info_bs.select('p')[0].text.replace('\r', '\n')
        if title1 not in com_info.keys():
            com_info[title1] = {}
        com_info[title1] = explanation
        title1 = com_info_bs.select('h2')[1].text   # 主要商品/服務項目 (main products/services)
        explanation = com_info_bs.select('p')[1].text.replace('\r', '')
        if title1 not in com_info.keys():
            com_info[title1] = {}
        com_info[title1] = explanation
        return com_info

    # number of people applying for a job
    @staticmethod
    def num_of_people(job_web_bs):
        number_of_people_bs = job_web_bs.select('.sub')[0]
        number_of_people = number_of_people_bs.select('a')[0].text
        return number_of_people

def load_104_newpeople_main(search_url, filename='./test.csv'):
    print('write to file ', filename)

    job_list = [['', '工作內容', '條件要求', '公司福利'],
                ['工作說明', '職務類別', '工作待遇', '工作性質', '上班地點', '管理責任', '出差外派', '上班時段', '休假制度', '可上班日', '需求人數'],
                ['接受身份', '工作經歷', '學歷要求', '科系要求', '語文條件', '擅長工具', '工作技能', '具備駕照', '其他條件']]
    com_list = [['', '公司介紹', '公司簡介', '主要商品/服務項目'],
                ['產業類別', '產業描述', '員工', '資\xa0本\xa0額', '聯\xa0絡\xa0人', '公司地址', '電話', '傳真', '公司網址']]
    all_headlist = ['職務名稱', '公司名稱', '學歷', '地區', '應徵人數',
                    '工作說明', '職務類別', '工作待遇', '工作性質', '上班地點', '管理責任', '出差外派', '上班時段', '休假制度', '可上班日', '需求人數', '接受身份', '工作經歷', '學歷要求', '科系要求', '語文條件', '擅長工具', '工作技能', '具備駕照', '其他條件', '公司福利',
                    '產業類別', '產業描述', '員工', '資本額', '聯絡人', '公司地址', '電話', '傳真', '公司網址', '公司簡介', '主要商品/服務項目']
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
            'Accept-Language': 'zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4'}
    res = requests.get(search_url, headers=head)
    res.encoding = 'utf8'
    soup = bs(res.text, 'html.parser')
    wl = writeline(all_headlist)
    fw = open(filename, 'w')   # default encoding: cp950 on Traditional-Chinese Windows
    fw.writelines(wl + '\n')
    allpage = get_all_page(soup) + 1
    for page in range(1, allpage):
        if page == 1:
            print('page:', page)
            print(search_url)
        else:
            print('page:', page)
            # overwrite the digit after 'page=' in the search URL, then refetch
            findpage = 'page='
            local = search_url.find(findpage)
            tmp = list(search_url)
            tmp[local + len(findpage)] = str(page)
            search_url = ''.join(tmp)
            res = requests.get(search_url, headers=head)
            res.encoding = 'utf8'
            soup = bs(res.text, 'html.parser')
            print(search_url)
        job_box = soup.select(".job_box")[0]
        for joblis in job_box.select('.joblist_cont'):
            job = job_class(joblis)
            print(job.job_name)
            job_web_bs = load_web(job.job_url)
            com_web_bs = load_web(job.com_url)
            # number of people applying for the job
            num = job_class.num_of_people(job_web_bs)
            # job info
            job_info = job_class.load_job_info(job_web_bs)
            # company info
            com_info = job_class.load_com_info(com_web_bs)
            once_data = []
            once_data.append(job.job_name)
            once_data.append(job.com_name)
            once_data.append(job.edu)
            once_data.append(job.area)
            once_data.append(num)
            # job fields: fill blanks for keys missing on this posting
            for j in range(len(job_list[1])):
                try:
                    once_data.append(job_info[job_list[0][1]][job_list[1][j]])
                except KeyError:
                    once_data.append('')
            for j in range(len(job_list[2])):
                try:
                    once_data.append(job_info[job_list[0][2]][job_list[2][j]])
                except KeyError:
                    once_data.append('')
            once_data.append(job_info[job_list[0][3]])
            # company fields
            for j in range(len(com_list[1])):
                try:
                    once_data.append(com_info[com_list[0][1]][com_list[1][j]])
                except KeyError:
                    once_data.append('')
            once_data.append(com_info[com_list[0][2]])
            once_data.append(com_info[com_list[0][3]])
            # build the CSV line and blank out characters cp950 cannot encode
            wl = writeline(once_data)
            for j in range(len(wl)):
                try:
                    wl[j].encode('cp950').decode('cp950')
                except UnicodeEncodeError:
                    tmp = list(wl)
                    tmp[j] = " "
                    wl = ''.join(tmp)
            # write to file
            fw.writelines(wl + '\n')
            break   # leftover debug break: only the first job per page is scraped; remove it to take them all
    fw.close()
    print('output file successful')
    return 'successful'

if __name__ == '__main__':
    filename = './test.csv'
    search_url = "https://www.104.com.tw/area/freshman/search?keyword=%E6%A9%9F%E5%99%A8%E4%BA%BA&area=6001001000,6001002000&jobcategory=2007000000&industry=&page=1&sortField=APPEAR_DATE&sortMode=DESC"

    load_104_newpeople_main(search_url, filename)
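
Two fragile spots worth flagging in the code above: get_all_page only reads a single digit of the page count, and the page loop swaps the page number by overwriting one character of the URL string. The standard library's urllib.parse can rewrite the query parameter robustly for any page number; a sketch (set_page is a hypothetical helper, not part of the original script):

from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def set_page(url, page):
    # parse the query string, overwrite 'page', and reassemble the URL
    parts = urlparse(url)
    query = parse_qs(parts.query)
    query['page'] = [str(page)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

print(set_page("https://www.104.com.tw/area/freshman/search?page=1&sortMode=DESC", 12))   # works for multi-digit pages too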


